@@ -40,7 +40,7 @@ use mz_storage_client::controller::StorageTxn;
4040use mz_storage_types:: StorageDiff ;
4141use mz_storage_types:: sources:: SourceData ;
4242use timely:: progress:: { Antichain , Timestamp as TimelyTimestamp } ;
43- use tracing:: { debug, error, info} ;
43+ use tracing:: { debug, error, info, warn } ;
4444
4545use crate :: catalog:: open:: builtin_item_migration:: persist_schema:: { TableKey , TableKeySchema } ;
4646use crate :: catalog:: state:: LocalExpressionCache ;
@@ -402,10 +402,15 @@ async fn migrate_builtin_collections_incompatible(
402402 ) ;
403403 let mut global_id_shards: BTreeMap < _ , _ > = snapshot
404404 . into_iter ( )
405- . map ( |( ( key, value) , _ts, _diff) | {
406- let table_key = key. expect ( "persist decoding error" ) ;
407- let shard_id = value. expect ( "persist decoding error" ) ;
408- ( table_key, shard_id)
405+ . filter_map ( |( data, _ts, _diff) | {
406+ if let ( Ok ( table_key) , Ok ( shard_id) ) = data {
407+ Some ( ( table_key, shard_id) )
408+ } else {
409+ // If we can't decode the data, it has likely been written by a newer version, so
410+ // we ignore it.
411+ warn ! ( "skipping unreadable migration shard entry: {data:?}" ) ;
412+ None
413+ }
409414 } )
410415 . collect ( ) ;
411416
@@ -415,11 +420,25 @@ async fn migrate_builtin_collections_incompatible(
415420 let storage_collection_metadata = txn. get_collection_metadata ( ) ;
416421 for ( table_key, shard_id) in global_id_shards. clone ( ) {
417422 if table_key. build_version > build_version {
418- halt ! (
419- "saw build version {}, which is greater than current build version {}" ,
420- table_key. build_version,
421- build_version
422- ) ;
423+ if read_only {
424+ halt ! (
425+ "saw build version {}, which is greater than current build version {}" ,
426+ table_key. build_version,
427+ build_version
428+ ) ;
429+ } else {
430+ // If we are in leader mode, and a newer (read-only) version has started a
431+ // migration, we must not allow ourselves to get fenced out! Continuing here might
432+ // confuse any read-only process running the migrations concurrently, but it's
433+ // better for the read-only env to crash than the leader.
434+ // TODO(#9755): handle this in a more principled way
435+ warn ! (
436+ %table_key. build_version, %build_version,
437+ "saw build version which is greater than current build version" ,
438+ ) ;
439+ global_id_shards. remove ( & table_key) ;
440+ continue ;
441+ }
423442 }
424443
425444 if !migrated_storage_collections. contains ( & table_key. global_id )
0 commit comments