Simplify SpatialIndexCursor

Summary:
Since we have enough memory to hold all primary keys loaded from the spatial index, it is better to load all of them first (storing them in an unordered_set for deduplication) and then query the primary key column family one by one.

We need to dedup all IDs, so we'll end up storing all of them in memory even with the current approach.

Test Plan: ./spatial_db_test is happy

Reviewers: yinwang

Reviewed By: yinwang

Subscribers: leveldb

Differential Revision: https://reviews.facebook.net/D20949
This commit is contained in:
Igor Canadi 2014-08-01 16:50:40 -04:00
parent 9c5a3f4746
commit 5e3d5c5f6e

View File

@ -13,6 +13,7 @@
#include <vector> #include <vector>
#include <algorithm> #include <algorithm>
#include <set> #include <set>
#include <unordered_set>
#include "rocksdb/cache.h" #include "rocksdb/cache.h"
#include "rocksdb/db.h" #include "rocksdb/db.h"
@ -244,23 +245,62 @@ std::string FeatureSet::DebugString() const {
class SpatialIndexCursor : public Cursor { class SpatialIndexCursor : public Cursor {
public: public:
// tile_bbox is inclusive
SpatialIndexCursor(Iterator* spatial_iterator, Iterator* data_iterator, SpatialIndexCursor(Iterator* spatial_iterator, Iterator* data_iterator,
const BoundingBox<uint64_t>& tile_bbox, uint32_t tile_bits) const BoundingBox<uint64_t>& tile_bbox, uint32_t tile_bits)
: spatial_iterator_(spatial_iterator), : data_iterator_(data_iterator),
data_iterator_(data_iterator),
tile_bbox_(tile_bbox),
tile_bits_(tile_bits),
valid_(true) { valid_(true) {
current_x_ = tile_bbox.min_x; // calculate quad keys we'll need to query
current_y_ = tile_bbox.min_y; std::vector<uint64_t> quad_keys;
UpdateQuadKey(); quad_keys.reserve((tile_bbox.max_x - tile_bbox.min_x + 1) *
ReSeek(); (tile_bbox.max_y - tile_bbox.min_y + 1));
if (valid_) { for (uint64_t x = tile_bbox.min_x; x <= tile_bbox.max_x; ++x) {
// this is the first ID returned, so I don't care about return value of for (uint64_t y = tile_bbox.min_y; y <= tile_bbox.max_y; ++y) {
// Dedup quad_keys.push_back(GetQuadKeyFromTile(x, y, tile_bits));
Dedup(); }
} }
std::sort(quad_keys.begin(), quad_keys.end());
// load primary key ids for all quad keys
for (auto quad_key : quad_keys) {
std::string encoded_quad_key;
PutFixed64BigEndian(&encoded_quad_key, quad_key);
Slice slice_quad_key(encoded_quad_key);
// If CheckQuadKey is true, there is no need to reseek, since
// spatial_iterator is already pointing at the correct quad key. This is
// an optimization.
if (!CheckQuadKey(spatial_iterator, slice_quad_key)) {
spatial_iterator->Seek(slice_quad_key);
}
while (CheckQuadKey(spatial_iterator, slice_quad_key)) {
// extract ID from spatial_iterator
uint64_t id;
bool ok = GetFixed64BigEndian(
Slice(spatial_iterator->key().data() + sizeof(uint64_t),
sizeof(uint64_t)),
&id);
if (!ok) {
valid_ = false;
status_ = Status::Corruption("Spatial index corruption");
break;
}
primary_key_ids_.insert(id);
spatial_iterator->Next();
}
}
if (!spatial_iterator->status().ok()) {
status_ = spatial_iterator->status();
valid_ = false;
}
delete spatial_iterator;
valid_ = valid_ && primary_key_ids_.size() > 0;
if (valid_) { if (valid_) {
primary_keys_iterator_ = primary_key_ids_.begin();
ExtractData(); ExtractData();
} }
} }
@ -270,28 +310,13 @@ class SpatialIndexCursor : public Cursor {
virtual void Next() override { virtual void Next() override {
assert(valid_); assert(valid_);
// this do-while loop deals only with deduplication ++primary_keys_iterator_;
do { if (primary_keys_iterator_ == primary_key_ids_.end()) {
spatial_iterator_->Next(); valid_ = false;
if (ExtractID()) { return;
// OK, found what we needed
continue;
}
// move to the next tile
Increment();
if (ExtractID()) {
// no need to reseek, found what we needed
continue;
}
// reseek, find next good tile
ReSeek();
} while (valid_ && !Dedup() && valid_);
if (valid_) {
ExtractData();
} }
ExtractData();
} }
virtual const Slice blob() override { return current_blob_; } virtual const Slice blob() override { return current_blob_; }
@ -303,88 +328,44 @@ class SpatialIndexCursor : public Cursor {
if (!status_.ok()) { if (!status_.ok()) {
return status_; return status_;
} }
if (!spatial_iterator_->status().ok()) {
return spatial_iterator_->status();
}
return data_iterator_->status(); return data_iterator_->status();
} }
private: private:
// returns true if OK, false if already returned (duplicate)
bool Dedup() {
assert(valid_);
uint64_t id;
bool ok = GetFixed64BigEndian(current_id_, &id);
if (!ok) {
valid_ = false;
status_ = Status::Corruption("Spatial index corruption");
return false;
}
if (returned_ids_.find(id) != returned_ids_.end()) {
return false;
}
returned_ids_.insert(id);
return true;
}
void ReSeek() {
while (valid_) {
spatial_iterator_->Seek(current_quad_key_);
if (ExtractID()) {
// found what we're looking for!
break;
}
Increment();
}
}
void Increment() {
++current_x_;
if (current_x_ > tile_bbox_.max_x) {
current_x_ = tile_bbox_.min_x;
++current_y_;
}
if (current_y_ > tile_bbox_.max_y) {
valid_ = false;
} else {
UpdateQuadKey();
}
}
void UpdateQuadKey() {
current_quad_key_.clear();
PutFixed64BigEndian(&current_quad_key_,
GetQuadKeyFromTile(current_x_, current_y_, tile_bits_));
}
// * returns true if spatial iterator is on the current quad key and all is // * returns true if spatial iterator is on the current quad key and all is
// well. Caller will call Next() to get new data // well
// * returns false if spatial iterator is not on current, or invalid or status // * returns false if spatial iterator is not on current, or iterator is
// bad. Caller will need to reseek to get new data // invalid or corruption
bool ExtractID() { bool CheckQuadKey(Iterator* spatial_iterator, const Slice& quad_key) {
if (!spatial_iterator_->Valid()) { if (!spatial_iterator->Valid()) {
// caller needs to reseek
return false; return false;
} }
if (spatial_iterator_->key().size() != 2 * sizeof(uint64_t)) { if (spatial_iterator->key().size() != 2 * sizeof(uint64_t)) {
status_ = Status::Corruption("Invalid spatial index key"); status_ = Status::Corruption("Invalid spatial index key");
valid_ = false; valid_ = false;
return false; return false;
} }
Slice quad_key(spatial_iterator_->key().data(), sizeof(uint64_t)); Slice spatial_iterator_quad_key(spatial_iterator->key().data(),
if (quad_key != current_quad_key_) { sizeof(uint64_t));
if (spatial_iterator_quad_key != quad_key) {
// caller needs to reseek // caller needs to reseek
return false; return false;
} }
// if we come to here, we have found the quad key // if we come to here, we have found the quad key
current_id_ = Slice(spatial_iterator_->key().data() + sizeof(uint64_t),
sizeof(uint64_t));
return true; return true;
} }
// doesn't return anything, but sets valid_ and status_ on corruption // doesn't return anything, but sets valid_ and status_ on corruption
void ExtractData() { void ExtractData() {
assert(valid_); assert(valid_);
data_iterator_->Seek(current_id_); std::string encoded_id;
PutFixed64BigEndian(&encoded_id, *primary_keys_iterator_);
if (!data_iterator_->Valid() || data_iterator_->key() != current_id_) { data_iterator_->Seek(encoded_id);
status_ = Status::Corruption("Inconsistency in data column family");
if (!data_iterator_->Valid() ||
data_iterator_->key() != Slice(encoded_id)) {
status_ = Status::Corruption("Index inconsistency");
valid_ = false; valid_ = false;
return; return;
} }
@ -393,28 +374,22 @@ class SpatialIndexCursor : public Cursor {
current_feature_set_.Clear(); current_feature_set_.Clear();
if (!GetLengthPrefixedSlice(&data, &current_blob_) || if (!GetLengthPrefixedSlice(&data, &current_blob_) ||
!current_feature_set_.Deserialize(data)) { !current_feature_set_.Deserialize(data)) {
status_ = Status::Corruption("Data column family corruption"); status_ = Status::Corruption("Primary key column family corruption");
valid_ = false; valid_ = false;
return; return;
} }
} }
unique_ptr<Iterator> spatial_iterator_;
unique_ptr<Iterator> data_iterator_; unique_ptr<Iterator> data_iterator_;
BoundingBox<uint64_t> tile_bbox_;
uint32_t tile_bits_;
uint64_t current_x_;
uint64_t current_y_;
std::string current_quad_key_;
Slice current_id_;
bool valid_; bool valid_;
Status status_; Status status_;
FeatureSet current_feature_set_; FeatureSet current_feature_set_;
Slice current_blob_; Slice current_blob_;
// used for deduplicating results // This is loaded from spatial iterator.
std::set<uint64_t> returned_ids_; std::unordered_set<uint64_t> primary_key_ids_;
std::unordered_set<uint64_t>::iterator primary_keys_iterator_;
}; };
class ErrorCursor : public Cursor { class ErrorCursor : public Cursor {