@@ -9502,6 +9502,37 @@ int ha_rocksdb::index_read_intern(uchar *const buf, const uchar *const key,
95029502 don't set the lock, it will be too coarse. Indicate that LockingIterator
95039503 should be used, instead.
95049504
9505+ == RangeFlagsShouldBeFlippedForRevCF ==
9506+ When using reverse column families, the value of Endpoint::inf_suffix has
9507+ the reverse meaning.
9508+
9509+ Let's consider a forward-ordered CF and some keys and endpoints in it:
9510+
9511+ key=a, inf_suffix=false
9512+ key=ab
9513+ key=az
9514+ key=a, inf_suffix=true
9515+
9516+ Now, let's put the same data and endpoints into a reverse-ordered CF. The
9517+ physical order of the data will be the reverse of the above:
9518+
9519+ key=a, inf_suffix=true
9520+ key=az
9521+ key=ab
9522+ key=a, inf_suffix=false
9523+
9524+ Note that inf_suffix=false comes *before* any values with the same prefix.
9525+ And inf_suffix=true comes *after* all values with the same prefix.
9526+
9527+ The Endpoint comparison function in RocksDB doesn't "know" if the CF is
9528+ reverse-ordered or not. It uses the Key Comparator for key values, and
9529+ then it assumes that Endpoint(key=$VAL, inf_suffix=false) comes before
9530+ the row with key=$VAL.
9531+
9532+ The only way to achieve the required ordering is to flip the endpoint
9533+ flag value before passing class Endpoint to RocksDB. This function does
9534+ it before the lock_range() call.
9535+
95059536 @return
95069537 0 Ok
95079538 Other Error acquiring the lock (wait timeout, deadlock, etc)
@@ -9554,8 +9585,7 @@ int ha_rocksdb::set_range_lock(Rdb_transaction *tx,
95549585 end_has_inf_suffix= true;
95559586 end_slice= slice;
95569587 }
9557- }
9558- else if (find_flag == HA_READ_PREFIX_LAST_OR_PREV) {
9588+ } else if (find_flag == HA_READ_PREFIX_LAST_OR_PREV) {
95599589 /*
95609590 We get here for queries like:
95619591
@@ -9580,8 +9610,7 @@ int ha_rocksdb::set_range_lock(Rdb_transaction *tx,
95809610 end_slice_size);
95819611 start_has_inf_suffix= false;
95829612 end_has_inf_suffix= true;
9583- }
9584- else if (find_flag == HA_READ_BEFORE_KEY) {
9613+ } else if (find_flag == HA_READ_BEFORE_KEY) {
95859614 /*
95869615 We get here for queries like
95879616 select * from t1
@@ -9602,17 +9631,15 @@ int ha_rocksdb::set_range_lock(Rdb_transaction *tx,
96029631 slice= rocksdb::Slice(reinterpret_cast<char *>(end_slice_buf),
96039632 end_slice_size);
96049633
9605- end_has_inf_suffix= false;
9606- big_range= false;
9634+ // end_has_inf_suffix is false, because we're looking key<const
96079635 } else {
96089636 uint end_slice_size;
96099637 kd.get_infimum_key(end_slice_buf, &end_slice_size);
96109638 slice= rocksdb::Slice((char*)end_slice_buf, end_slice_size);
96119639
96129640 big_range= true;
96139641 }
9614- }
9615- else if (end_key) {
9642+ } else if (end_key) {
96169643 // Known start range bounds: HA_READ_KEY_OR_NEXT, HA_READ_AFTER_KEY
96179644 if (find_flag == HA_READ_KEY_OR_NEXT)
96189645 start_has_inf_suffix= false;
@@ -9644,9 +9671,7 @@ int ha_rocksdb::set_range_lock(Rdb_transaction *tx,
96449671
96459672 end_slice= rocksdb::Slice(reinterpret_cast<char *>(end_slice_buf),
96469673 end_slice_size);
9647- }
9648- else
9649- {
9674+ } else {
96509675 big_range= true;
96519676#if 0
96529677 // The below is code to handle this without LockingIterator:
@@ -9666,8 +9691,7 @@ int ha_rocksdb::set_range_lock(Rdb_transaction *tx,
96669691#endif
96679692 }
96689693
9669- if (big_range)
9670- {
9694+ if (big_range) {
96719695 *use_locking_iterator= true;
96729696 return 0;
96739697 }
@@ -9676,7 +9700,8 @@ int ha_rocksdb::set_range_lock(Rdb_transaction *tx,
96769700 rocksdb::Endpoint end_endp;
96779701
96789702 if (kd.m_is_reverse_cf) {
9679- // Flip the endpoints
9703+ // Flip the endpoint flag values, as explained in the
9704+ // RangeFlagsShouldBeFlippedForRevCF comment above.
96809705 start_endp =rocksdb::Endpoint(end_slice, !end_has_inf_suffix);
96819706 end_endp = rocksdb::Endpoint(slice, !start_has_inf_suffix);
96829707 } else {
0 commit comments