#include "BPlusTree.h"
#include <file_systems/QueryParserUtils.h>
#include "Debug.h"
#include "Utility.h"
#if !_BOOT_MODE
# include "Inode.h"
#else
# include "Stream.h"
# define Inode BFS::Stream
# define strerror(x) "error message unavailable"
namespace BFS {
#endif
struct duplicate_array {
off_t count;
off_t values[0];
inline bool IsEmpty() const
{
return count == 0;
}
inline int32 Count() const
{
return (int32)BFS_ENDIAN_TO_HOST_INT64(count);
}
inline off_t ValueAt(uint32 index) const
{
return BFS_ENDIAN_TO_HOST_INT64(values[index]);
}
inline void SetValueAt(uint32 index, off_t value)
{
values[index] = HOST_ENDIAN_TO_BFS_INT64(value);
}
inline int32 Find(off_t value) const
{
int32 i;
return _FindInternal(value, i) ? i : -1;
}
void Insert(off_t value);
bool Remove(off_t value);
private:
bool _FindInternal(off_t value, int32& index) const;
} _PACKED;
#ifdef DEBUG
class NodeChecker {
public:
NodeChecker(const bplustree_node* node, int32 nodeSize, const char* text)
:
fNode(node),
fSize(nodeSize),
fText(text)
{
Check("integrity check failed on construction.");
}
~NodeChecker()
{
Check("integrity check failed on destruction.");
}
void
Check(const char* message)
{
if (fNode->CheckIntegrity(fSize) != B_OK) {
dprintf("%s: %s\n", fText, message);
DEBUGGER(("NodeChecker integrity check failed!"));
}
}
private:
const bplustree_node* fNode;
int32 fSize;
const char* fText;
};
#endif
#if !_BOOT_MODE
class BitmapArray {
public:
BitmapArray(size_t numBits);
~BitmapArray();
status_t InitCheck() const;
bool IsSet(size_t index) const;
void Set(size_t index, bool set);
size_t CountSet() const { return fCountSet; }
private:
uint8* fBitmap;
size_t fSize;
size_t fCountSet;
};
struct TreeCheck {
TreeCheck(BPlusTree* tree)
:
fLevelCount(0),
fFreeCount(0),
fNodeSize(tree->NodeSize()),
fMaxLevels(tree->fHeader.MaxNumberOfLevels()),
fFoundErrors(0),
fVisited(tree->Stream()->Size() / tree->NodeSize()),
fVisitedFragment(tree->Stream()->Size() / tree->NodeSize())
{
fPreviousOffsets = (off_t*)malloc(
sizeof(off_t) * tree->fHeader.MaxNumberOfLevels());
if (fPreviousOffsets != NULL) {
for (size_t i = 0; i < fMaxLevels; i++)
fPreviousOffsets[i] = BPLUSTREE_NULL;
}
}
~TreeCheck()
{
free(fPreviousOffsets);
}
status_t InitCheck() const
{
if (fPreviousOffsets == NULL)
return B_NO_MEMORY;
status_t status = fVisited.InitCheck();
if (status != B_OK)
return status;
return fVisitedFragment.InitCheck();
}
bool Visited(off_t offset) const
{
return fVisited.IsSet(offset / fNodeSize);
}
void SetVisited(off_t offset)
{
fVisited.Set(offset / fNodeSize, true);
}
size_t VisitedCount() const
{
return fVisited.CountSet();
}
bool VisitedFragment(off_t offset) const
{
return fVisitedFragment.IsSet(offset / fNodeSize);
}
void SetVisitedFragment(off_t offset)
{
fVisitedFragment.Set(offset / fNodeSize, true);
}
uint32 MaxLevels() const
{
return fLevelCount;
}
void SetLevel(uint32 level)
{
if (fLevelCount < level)
fLevelCount = level;
}
off_t PreviousOffset(uint32 level)
{
return fPreviousOffsets[level];
}
void SetPreviousOffset(uint32 level, off_t offset)
{
fPreviousOffsets[level] = offset;
}
void FoundError()
{
fFoundErrors++;
}
bool ErrorsFound()
{
return fFoundErrors != 0;
}
private:
uint32 fLevelCount;
uint32 fFreeCount;
uint32 fNodeSize;
uint32 fMaxLevels;
uint32 fFoundErrors;
BitmapArray fVisited;
BitmapArray fVisitedFragment;
off_t* fPreviousOffsets;
};
void
CachedNode::UnsetUnchanged(Transaction& transaction)
{
if (fTree == NULL || fTree->fStream == NULL)
return;
if (fNode != NULL) {
void* cache = fTree->fStream->GetVolume()->BlockCache();
block_cache_set_dirty(cache, fBlockNumber, false, transaction.ID());
block_cache_put(cache, fBlockNumber);
fBlock = NULL;
fNode = NULL;
}
}
#endif
void
CachedNode::Unset()
{
if (fTree == NULL || fTree->fStream == NULL)
return;
if (fNode != NULL) {
#if !_BOOT_MODE
if (fWritable && fOffset == 0) {
memcpy(&fTree->fHeader, fNode, sizeof(bplustree_header));
}
block_cache_put(fTree->fStream->GetVolume()->BlockCache(),
fBlockNumber);
fBlock = NULL;
#endif
fNode = NULL;
}
}
const bplustree_node*
CachedNode::SetTo(off_t offset, bool check)
{
const bplustree_node* node;
if (SetTo(offset, &node, check) == B_OK)
return node;
return NULL;
}
status_t
CachedNode::SetTo(off_t offset, const bplustree_node** _node, bool check)
{
if (fTree == NULL || fTree->fStream == NULL)
RETURN_ERROR(B_BAD_VALUE);
if (offset > fTree->fHeader.MaximumSize() - fTree->fNodeSize
|| offset <= 0
|| (offset % fTree->fNodeSize) != 0) {
Unset();
RETURN_ERROR(B_BAD_VALUE);
}
status_t status = InternalSetTo(NULL, offset);
if (status != B_OK)
return status;
if (check) {
if (!fTree->fHeader.CheckNode(fNode)) {
FATAL(("invalid node [%p] read from offset %" B_PRIdOFF " (block %"
B_PRIdOFF "), inode at %" B_PRIdINO "\n", fNode, offset,
fBlockNumber, fTree->fStream->ID()));
Unset();
return B_BAD_DATA;
}
}
*_node = fNode;
return B_OK;
}
#if !_BOOT_MODE
bplustree_node*
CachedNode::SetToWritable(Transaction& transaction, off_t offset, bool check)
{
if (fTree == NULL || fTree->fStream == NULL) {
REPORT_ERROR(B_BAD_VALUE);
return NULL;
}
if (offset > fTree->fHeader.MaximumSize() - fTree->fNodeSize
|| offset <= 0
|| (offset % fTree->fNodeSize) != 0) {
Unset();
return NULL;
}
if (InternalSetTo(&transaction, offset) != B_OK)
return NULL;
if (check) {
if (!fTree->fHeader.CheckNode(fNode)) {
FATAL(("invalid node [%p] read from offset %" B_PRIdOFF " (block %"
B_PRIdOFF "), inode at %" B_PRIdINO "\n", fNode, offset,
fBlockNumber, fTree->fStream->ID()));
Unset();
return NULL;
}
}
return fNode;
}
bplustree_node*
CachedNode::MakeWritable(Transaction& transaction)
{
if (fNode == NULL)
return NULL;
if (block_cache_make_writable(transaction.GetVolume()->BlockCache(),
fBlockNumber, transaction.ID()) == B_OK) {
return fNode;
}
return NULL;
}
#endif
const bplustree_header*
CachedNode::SetToHeader()
{
if (fTree == NULL || fTree->fStream == NULL) {
REPORT_ERROR(B_BAD_VALUE);
return NULL;
}
InternalSetTo(NULL, 0LL);
return (bplustree_header*)fNode;
}
#if !_BOOT_MODE
bplustree_header*
CachedNode::SetToWritableHeader(Transaction& transaction)
{
if (fTree == NULL || fTree->fStream == NULL) {
REPORT_ERROR(B_BAD_VALUE);
return NULL;
}
InternalSetTo(&transaction, 0LL);
if (fNode != NULL && !fTree->fInTransaction) {
transaction.AddListener(fTree);
fTree->fInTransaction = true;
if (!transaction.GetVolume()->IsInitializing()
&& fTree->fStream != transaction.GetVolume()->IndicesNode()) {
acquire_vnode(transaction.GetVolume()->FSVolume(),
fTree->fStream->ID());
}
}
return (bplustree_header*)fNode;
}
#endif
status_t
CachedNode::InternalSetTo(Transaction* transaction, off_t offset)
{
if (offset >= fTree->fStream->Size()) {
Unset();
RETURN_ERROR(B_BAD_VALUE);
}
off_t fileOffset;
block_run run;
status_t status = fTree->fStream->FindBlockRun(offset, run, fileOffset);
if (status != B_OK) {
Unset();
return status;
}
#if !_BOOT_MODE
Volume* volume = fTree->fStream->GetVolume();
#else
Volume* volume = &fTree->fStream->GetVolume();
#endif
const int32 blockOffset = (offset - fileOffset) / volume->BlockSize();
const off_t newBlockNumber = volume->ToBlock(run) + blockOffset;
uint8* block = NULL;
if (transaction == NULL && fBlock != NULL && fBlockNumber == newBlockNumber) {
block = fBlock;
} else {
#if !_BOOT_MODE
if (fBlock != NULL)
Unset();
if (transaction != NULL) {
block = fBlock = (uint8*)block_cache_get_writable(volume->BlockCache(),
newBlockNumber, transaction->ID());
fWritable = true;
} else {
block = fBlock = (uint8*)block_cache_get(volume->BlockCache(), newBlockNumber);
fWritable = false;
}
#else
if (fBlock == NULL) {
fBlock = (uint8*)malloc(volume->BlockSize());
if (fBlock == NULL)
RETURN_ERROR(B_NO_MEMORY);
}
if (read_pos(volume->Device(), newBlockNumber << volume->BlockShift(),
fBlock, volume->BlockSize()) == (ssize_t)volume->BlockSize()) {
block = fBlock;
}
fWritable = false;
#endif
}
fOffset = offset;
fBlockNumber = newBlockNumber;
if (block != NULL) {
fNode = (bplustree_node*)(block + offset
- (fileOffset + (blockOffset << volume->BlockShift())));
} else {
fNode = NULL;
RETURN_ERROR(B_IO_ERROR);
}
return B_OK;
}
#if !_BOOT_MODE
status_t
CachedNode::Free(Transaction& transaction, off_t offset)
{
if (fTree == NULL || fTree->fStream == NULL || offset == BPLUSTREE_NULL)
RETURN_ERROR(B_BAD_VALUE);
CachedNode cached(fTree);
bplustree_header* header = cached.SetToWritableHeader(transaction);
if (header == NULL)
return B_IO_ERROR;
#if 0
off_t lastOffset = header->MaximumSize() - fTree->fNodeSize;
if (offset == lastOffset) {
status_t status = fTree->fStream->SetFileSize(transaction, lastOffset);
if (status != B_OK)
return status;
header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(lastOffset);
return B_OK;
}
#endif
fNode->left_link = header->free_node_pointer;
fNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE);
header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(offset);
return B_OK;
}
status_t
CachedNode::Allocate(Transaction& transaction, bplustree_node** _node,
off_t* _offset)
{
if (fTree == NULL || fTree->fStream == NULL)
RETURN_ERROR(B_BAD_VALUE);
Unset();
bplustree_header* header;
status_t status;
if (SetToWritable(transaction, fTree->fHeader.FreeNode(), false) != NULL) {
CachedNode cached(fTree);
header = cached.SetToWritableHeader(transaction);
if (header == NULL)
return B_IO_ERROR;
*_offset = header->FreeNode();
*_node = fNode;
header->free_node_pointer = fNode->left_link;
fNode->Initialize();
return B_OK;
}
Inode* stream = fTree->fStream;
if ((status = stream->Append(transaction, fTree->fNodeSize)) != B_OK)
return status;
CachedNode cached(fTree);
header = cached.SetToWritableHeader(transaction);
if (header == NULL)
return B_IO_ERROR;
off_t offset = header->MaximumSize();
header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize()
+ fTree->fNodeSize);
cached.Unset();
if (SetToWritable(transaction, offset, false) == NULL)
RETURN_ERROR(B_ERROR);
fNode->Initialize();
*_offset = offset;
*_node = fNode;
return B_OK;
}
BPlusTree::BPlusTree(Transaction& transaction, Inode* stream, int32 nodeSize)
:
fStream(NULL),
fInTransaction(false)
{
mutex_init(&fIteratorLock, "bfs b+tree iterator");
SetTo(transaction, stream);
}
#endif
BPlusTree::BPlusTree(Inode* stream)
:
fStream(NULL),
fInTransaction(false)
{
#if !_BOOT_MODE
mutex_init(&fIteratorLock, "bfs b+tree iterator");
#endif
SetTo(stream);
}
BPlusTree::BPlusTree()
:
fStream(NULL),
fNodeSize(BPLUSTREE_NODE_SIZE),
fAllowDuplicates(true),
fInTransaction(false),
fStatus(B_NO_INIT)
{
#if !_BOOT_MODE
mutex_init(&fIteratorLock, "bfs b+tree iterator");
#endif
}
BPlusTree::~BPlusTree()
{
#if !_BOOT_MODE
mutex_lock(&fIteratorLock);
SinglyLinkedList<TreeIterator>::ConstIterator iterator
= fIterators.GetIterator();
while (iterator.HasNext())
iterator.Next()->Stop();
mutex_destroy(&fIteratorLock);
ASSERT(!fInTransaction);
#endif
}
#if !_BOOT_MODE
status_t
BPlusTree::SetTo(Transaction& transaction, Inode* stream, int32 nodeSize)
{
fStream = stream;
CachedNode cached(this);
bplustree_header* header = cached.SetToWritableHeader(transaction);
if (header == NULL) {
fStatus = stream->SetFileSize(transaction, nodeSize * 2);
if (fStatus != B_OK)
RETURN_ERROR(fStatus);
header = cached.SetToWritableHeader(transaction);
if (header == NULL)
RETURN_ERROR(fStatus = B_ERROR);
}
fAllowDuplicates = stream->IsIndex()
|| (stream->Mode() & S_ALLOW_DUPS) != 0;
fNodeSize = nodeSize;
header->magic = HOST_ENDIAN_TO_BFS_INT32(BPLUSTREE_MAGIC);
header->node_size = HOST_ENDIAN_TO_BFS_INT32(fNodeSize);
header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
header->data_type = HOST_ENDIAN_TO_BFS_INT32(ModeToKeyType(stream->Mode()));
header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(nodeSize);
header->free_node_pointer
= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(nodeSize * 2);
cached.Unset();
cached.SetToWritable(transaction, fHeader.RootNode(), false);
if (cached.Node() == NULL)
RETURN_ERROR(B_IO_ERROR);
cached.Node()->Initialize();
return fStatus = B_OK;
}
#endif
status_t
BPlusTree::SetTo(Inode* stream)
{
if (stream == NULL)
RETURN_ERROR(fStatus = B_BAD_VALUE);
fStream = stream;
CachedNode cached(this);
const bplustree_header* header = cached.SetToHeader();
if (header != NULL)
memcpy(&fHeader, header, sizeof(bplustree_header));
else
RETURN_ERROR(fStatus = B_IO_ERROR);
if (fHeader.MaximumSize() != stream->Size()) {
dprintf("B+tree header size %" B_PRIdOFF " doesn't fit file size %"
B_PRIdOFF "!\n", fHeader.MaximumSize(), stream->Size());
}
if (!fHeader.IsValid()) {
#ifdef DEBUG
dump_bplustree_header(&fHeader);
dump_block((const char*)&fHeader, 128);
#endif
RETURN_ERROR(fStatus = B_BAD_DATA);
}
fNodeSize = fHeader.NodeSize();
static const uint32 kToMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX,
S_LONG_LONG_INDEX, S_ULONG_LONG_INDEX, S_FLOAT_INDEX,
S_DOUBLE_INDEX};
uint32 mode = stream->Mode() & (S_STR_INDEX | S_INT_INDEX
| S_UINT_INDEX | S_LONG_LONG_INDEX | S_ULONG_LONG_INDEX
| S_FLOAT_INDEX | S_DOUBLE_INDEX);
if (fHeader.DataType() > BPLUSTREE_DOUBLE_TYPE
|| ((stream->Mode() & S_INDEX_DIR) != 0
&& kToMode[fHeader.DataType()] != mode)
|| !stream->IsContainer()) {
D( dump_bplustree_header(&fHeader);
dump_inode(&stream->Node());
);
RETURN_ERROR(fStatus = B_BAD_TYPE);
}
fAllowDuplicates = stream->IsIndex()
|| (stream->Mode() & S_ALLOW_DUPS) != 0;
cached.SetTo(fHeader.RootNode());
RETURN_ERROR(fStatus = cached.Node() ? B_OK : B_BAD_DATA);
}
status_t
BPlusTree::InitCheck()
{
return fStatus;
}
#if !_BOOT_MODE
status_t
BPlusTree::Validate(bool repair, bool& _errorsFound)
{
TreeCheck check(this);
if (check.InitCheck() != B_OK)
return B_NO_MEMORY;
check.SetVisited(0);
CachedNode cached(this);
off_t freeOffset = fHeader.FreeNode();
while (freeOffset > 0) {
const bplustree_node* node;
status_t status = cached.SetTo(freeOffset, &node, false);
if (status != B_OK) {
if (status == B_IO_ERROR)
return B_IO_ERROR;
dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF " could "
"not be read: %s\n", fStream->ID(), freeOffset,
strerror(status));
check.FoundError();
break;
}
if (check.Visited(freeOffset)) {
dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF
" circular!\n", fStream->ID(), freeOffset);
check.FoundError();
break;
}
check.SetVisited(freeOffset);
if (node->OverflowLink() != BPLUSTREE_FREE) {
dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF
" misses free mark!\n", fStream->ID(), freeOffset);
}
freeOffset = node->LeftLink();
}
const bplustree_node* root;
status_t status = cached.SetTo(fHeader.RootNode(), &root, true);
if (status != B_OK)
return status;
status = _ValidateChildren(check, 0, fHeader.RootNode(), NULL, 0, root);
if (check.ErrorsFound())
_errorsFound = true;
if (status != B_OK)
return status;
if (check.MaxLevels() + 1 != fHeader.MaxNumberOfLevels()) {
dprintf("inode %" B_PRIdOFF ": found %" B_PRIu32 " max levels, "
"declared %" B_PRIu32 "!\n", fStream->ID(), check.MaxLevels(),
fHeader.MaxNumberOfLevels());
}
if ((off_t)check.VisitedCount() != fHeader.MaximumSize() / fNodeSize) {
dprintf("inode %" B_PRIdOFF ": visited %" B_PRIuSIZE " from %" B_PRIdOFF
" nodes.\n", fStream->ID(), check.VisitedCount(),
fHeader.MaximumSize() / fNodeSize);
}
return B_OK;
}
status_t
BPlusTree::MakeEmpty()
{
Transaction transaction(fStream->GetVolume(), fStream->BlockNumber());
fStream->WriteLockInTransaction(transaction);
CachedNode cached(this);
bplustree_header* header = cached.SetToWritableHeader(transaction);
if (header == NULL)
return B_IO_ERROR;
header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(NodeSize());
if (fStream->Size() > (off_t)NodeSize() * 2)
header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(2 * NodeSize());
else {
header->free_node_pointer
= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
}
bplustree_node* node = cached.SetToWritable(transaction, NodeSize(), false);
if (node == NULL)
return B_IO_ERROR;
node->left_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
node->right_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
node->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
node->all_key_count = 0;
node->all_key_length = 0;
for (off_t offset = 2 * NodeSize(); offset < fStream->Size();
offset += NodeSize()) {
bplustree_node* node = cached.SetToWritable(transaction, offset, false);
if (node == NULL) {
dprintf("--> could not open %" B_PRIdOFF "\n", offset);
return B_IO_ERROR;
}
if (offset < fStream->Size() - (off_t)NodeSize())
node->left_link = HOST_ENDIAN_TO_BFS_INT64(offset + NodeSize());
else
node->left_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
node->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE);
if (offset % (1024 * 1024) == 0) {
transaction.Done();
transaction.Start(fStream->GetVolume(), fStream->BlockNumber());
fStream->WriteLockInTransaction(transaction);
}
}
return transaction.Done();
}
int32
BPlusTree::TypeCodeToKeyType(type_code code)
{
switch (code) {
case B_STRING_TYPE:
return BPLUSTREE_STRING_TYPE;
case B_SSIZE_T_TYPE:
case B_INT32_TYPE:
return BPLUSTREE_INT32_TYPE;
case B_SIZE_T_TYPE:
case B_UINT32_TYPE:
return BPLUSTREE_UINT32_TYPE;
case B_OFF_T_TYPE:
case B_INT64_TYPE:
return BPLUSTREE_INT64_TYPE;
case B_UINT64_TYPE:
return BPLUSTREE_UINT64_TYPE;
case B_FLOAT_TYPE:
return BPLUSTREE_FLOAT_TYPE;
case B_DOUBLE_TYPE:
return BPLUSTREE_DOUBLE_TYPE;
}
return -1;
}
int32
BPlusTree::ModeToKeyType(mode_t mode)
{
switch (mode & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX
| S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX)) {
case S_INT_INDEX:
return BPLUSTREE_INT32_TYPE;
case S_UINT_INDEX:
return BPLUSTREE_UINT32_TYPE;
case S_LONG_LONG_INDEX:
return BPLUSTREE_INT64_TYPE;
case S_ULONG_LONG_INDEX:
return BPLUSTREE_UINT64_TYPE;
case S_FLOAT_INDEX:
return BPLUSTREE_FLOAT_TYPE;
case S_DOUBLE_INDEX:
return BPLUSTREE_DOUBLE_TYPE;
case S_STR_INDEX:
default:
return BPLUSTREE_STRING_TYPE;
}
}
#endif
#if !_BOOT_MODE
void
BPlusTree::TransactionDone(bool success)
{
if (!success) {
CachedNode cached(this);
const bplustree_header* header = cached.SetToHeader();
if (header != NULL)
memcpy(&fHeader, header, sizeof(bplustree_header));
}
}
void
BPlusTree::RemovedFromTransaction()
{
fInTransaction = false;
if (!fStream->GetVolume()->IsInitializing() && fStream != fStream->GetVolume()->IndicesNode())
put_vnode(fStream->GetVolume()->FSVolume(), fStream->ID());
}
void
BPlusTree::_UpdateIterators(off_t offset, off_t nextOffset, uint16 keyIndex,
uint16 splitAt, int8 change)
{
MutexLocker _(fIteratorLock);
SinglyLinkedList<TreeIterator>::ConstIterator iterator
= fIterators.GetIterator();
while (iterator.HasNext())
iterator.Next()->Update(offset, nextOffset, keyIndex, splitAt, change);
}
void
BPlusTree::_AddIterator(TreeIterator* iterator)
{
MutexLocker _(fIteratorLock);
fIterators.Add(iterator);
}
void
BPlusTree::_RemoveIterator(TreeIterator* iterator)
{
MutexLocker _(fIteratorLock);
fIterators.Remove(iterator);
}
#endif
int32
BPlusTree::_CompareKeys(const void* key1, int keyLength1, const void* key2,
int keyLength2)
{
type_code type = 0;
switch (fHeader.DataType()) {
case BPLUSTREE_STRING_TYPE:
type = B_STRING_TYPE;
break;
case BPLUSTREE_INT32_TYPE:
type = B_INT32_TYPE;
break;
case BPLUSTREE_UINT32_TYPE:
type = B_UINT32_TYPE;
break;
case BPLUSTREE_INT64_TYPE:
type = B_INT64_TYPE;
break;
case BPLUSTREE_UINT64_TYPE:
type = B_UINT64_TYPE;
break;
case BPLUSTREE_FLOAT_TYPE:
type = B_FLOAT_TYPE;
break;
case BPLUSTREE_DOUBLE_TYPE:
type = B_DOUBLE_TYPE;
break;
}
return QueryParser::compareKeys(type, key1, keyLength1, key2, keyLength2);
}
status_t
BPlusTree::_FindKey(const bplustree_node* node, const uint8* key,
uint16 keyLength, uint16* _index, off_t* _next)
{
#ifdef DEBUG
NodeChecker checker(node, fNodeSize, "find");
#endif
if (node->all_key_count == 0) {
if (_index)
*_index = 0;
if (_next)
*_next = node->OverflowLink();
return B_ENTRY_NOT_FOUND;
}
Unaligned<off_t>* values = node->Values();
int16 saveIndex = -1;
for (int16 first = 0, last = node->NumKeys() - 1; first <= last;) {
uint16 i = (first + last) >> 1;
uint16 searchLength = 0;
uint8* searchKey = node->KeyAt(i, &searchLength);
if (searchKey + searchLength + sizeof(off_t) + sizeof(uint16)
> (uint8*)node + fNodeSize
|| searchLength > BPLUSTREE_MAX_KEY_LENGTH) {
#if !_BOOT_MODE
fStream->GetVolume()->Panic();
#endif
RETURN_ERROR(B_BAD_DATA);
}
int32 cmp = _CompareKeys(key, keyLength, searchKey, searchLength);
if (cmp < 0) {
last = i - 1;
saveIndex = i;
} else if (cmp > 0) {
saveIndex = first = i + 1;
} else {
if (_index)
*_index = i;
if (_next)
*_next = BFS_ENDIAN_TO_HOST_INT64(values[i]);
return B_OK;
}
}
if (_index)
*_index = saveIndex;
if (_next) {
if (saveIndex == node->NumKeys())
*_next = node->OverflowLink();
else
*_next = BFS_ENDIAN_TO_HOST_INT64(values[saveIndex]);
}
return B_ENTRY_NOT_FOUND;
}
#if !_BOOT_MODE
status_t
BPlusTree::_SeekDown(Stack<node_and_key>& stack, const uint8* key,
uint16 keyLength)
{
node_and_key nodeAndKey;
nodeAndKey.nodeOffset = fHeader.RootNode();
CachedNode cached(this);
const bplustree_node* node;
while ((node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
if (node->OverflowLink() == BPLUSTREE_NULL) {
nodeAndKey.keyIndex = 0;
stack.Push(nodeAndKey);
return B_OK;
}
off_t nextOffset;
status_t status = _FindKey(node, key, keyLength, &nodeAndKey.keyIndex,
&nextOffset);
if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset)
RETURN_ERROR(B_ERROR);
if ((uint32)stack.CountItems() > fHeader.MaxNumberOfLevels()) {
dprintf("BPlusTree::_SeekDown() node walked too deep.\n");
break;
}
stack.Push(nodeAndKey);
nodeAndKey.nodeOffset = nextOffset;
}
FATAL(("BPlusTree::_SeekDown() could not open node %" B_PRIdOFF ", inode %"
B_PRIdOFF "\n", nodeAndKey.nodeOffset, fStream->ID()));
return B_ERROR;
}
status_t
BPlusTree::_FindFreeDuplicateFragment(Transaction& transaction,
const bplustree_node* node, CachedNode& cached,
off_t* _offset, bplustree_node** _fragment, uint32* _index)
{
Unaligned<off_t>* values = node->Values();
for (int32 i = 0; i < node->NumKeys(); i++) {
off_t value = BFS_ENDIAN_TO_HOST_INT64(values[i]);
if (bplustree_node::LinkType(value) != BPLUSTREE_DUPLICATE_FRAGMENT)
continue;
const bplustree_node* fragment = cached.SetTo(
bplustree_node::FragmentOffset(value), false);
if (fragment == NULL) {
FATAL(("Could not get duplicate fragment at %" B_PRIdOFF ", inode %"
B_PRIdOFF "\n", value, fStream->ID()));
continue;
}
uint32 num = bplustree_node::MaxFragments(fNodeSize);
for (uint32 j = 0; j < num; j++) {
duplicate_array* array = fragment->FragmentAt(j);
if (array->IsEmpty()) {
*_fragment = cached.MakeWritable(transaction);
if (*_fragment == NULL)
return B_IO_ERROR;
*_offset = bplustree_node::FragmentOffset(value);
*_index = j;
return B_OK;
}
}
}
return B_ENTRY_NOT_FOUND;
}
status_t
BPlusTree::_InsertDuplicate(Transaction& transaction, CachedNode& cached,
const bplustree_node* node, uint16 index, off_t value)
{
CachedNode cachedDuplicate(this);
Unaligned<off_t>* values = node->Values();
off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);
status_t status;
off_t offset;
if (bplustree_node::IsDuplicate(oldValue)) {
if (bplustree_node::LinkType(oldValue)
== BPLUSTREE_DUPLICATE_FRAGMENT) {
bplustree_node* duplicate = cachedDuplicate.SetToWritable(
transaction, bplustree_node::FragmentOffset(oldValue), false);
if (duplicate == NULL)
return B_IO_ERROR;
duplicate_array* array = duplicate->FragmentAt(
bplustree_node::FragmentIndex(oldValue));
int32 arrayCount = array->Count();
if (arrayCount > NUM_FRAGMENT_VALUES || arrayCount < 1) {
FATAL(("_InsertDuplicate: Invalid array[%d] size in fragment "
"%" B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
(int)bplustree_node::FragmentIndex(oldValue),
bplustree_node::FragmentOffset(oldValue), arrayCount,
fStream->ID()));
return B_BAD_DATA;
}
if (arrayCount < NUM_FRAGMENT_VALUES) {
array->Insert(value);
} else {
if (duplicate->FragmentsUsed(fNodeSize) < 2) {
offset = bplustree_node::FragmentOffset(oldValue);
memmove(duplicate->DuplicateArray(), array,
(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
duplicate->left_link = duplicate->right_link
= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
array = duplicate->DuplicateArray();
array->Insert(value);
} else {
cachedDuplicate.UnsetUnchanged(transaction);
bplustree_node* newDuplicate;
status = cachedDuplicate.Allocate(transaction,
&newDuplicate, &offset);
if (status != B_OK)
RETURN_ERROR(status);
newDuplicate->overflow_link = array->count;
memcpy(&newDuplicate->all_key_count, &array->values[0],
array->Count() * sizeof(off_t));
memset(array, 0, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
array = newDuplicate->DuplicateArray();
array->Insert(value);
}
if (cached.MakeWritable(transaction) == NULL)
return B_IO_ERROR;
values[index]
= HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
BPLUSTREE_DUPLICATE_NODE, offset));
}
return B_OK;
}
duplicate_array* array;
int32 arrayCount;
const bplustree_node* duplicate;
off_t duplicateOffset;
do {
duplicateOffset = bplustree_node::FragmentOffset(oldValue);
duplicate = cachedDuplicate.SetTo(duplicateOffset, false);
if (duplicate == NULL)
return B_IO_ERROR;
array = duplicate->DuplicateArray();
arrayCount =array->Count();
if (arrayCount > NUM_DUPLICATE_VALUES || arrayCount < 0) {
FATAL(("_InsertDuplicate: Invalid array size in duplicate %"
B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
duplicateOffset, arrayCount, fStream->ID()));
return B_BAD_DATA;
}
} while (arrayCount >= NUM_DUPLICATE_VALUES
&& (oldValue = duplicate->RightLink()) != BPLUSTREE_NULL);
bplustree_node* writableDuplicate
= cachedDuplicate.MakeWritable(transaction);
if (writableDuplicate == NULL)
return B_IO_ERROR;
if (arrayCount < NUM_DUPLICATE_VALUES) {
array = writableDuplicate->DuplicateArray();
array->Insert(value);
} else {
bplustree_node* newDuplicate;
status = cachedDuplicate.Allocate(transaction, &newDuplicate,
&offset);
if (status != B_OK)
RETURN_ERROR(status);
writableDuplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(offset);
newDuplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(duplicateOffset);
array = newDuplicate->DuplicateArray();
array->count = 0;
array->Insert(value);
}
return B_OK;
}
uint32 fragmentIndex = 0;
bplustree_node* fragment;
if (_FindFreeDuplicateFragment(transaction, node, cachedDuplicate,
&offset, &fragment, &fragmentIndex) != B_OK) {
status = cachedDuplicate.Allocate(transaction, &fragment, &offset);
if (status != B_OK)
RETURN_ERROR(status);
memset(fragment, 0, fNodeSize);
}
duplicate_array* array = fragment->FragmentAt(fragmentIndex);
array->Insert(oldValue);
array->Insert(value);
if (cached.MakeWritable(transaction) == NULL)
return B_IO_ERROR;
values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
BPLUSTREE_DUPLICATE_FRAGMENT, offset, fragmentIndex));
return B_OK;
}
void
BPlusTree::_InsertKey(bplustree_node* node, uint16 index, uint8* key,
uint16 keyLength, off_t value)
{
if (index > node->NumKeys())
return;
Unaligned<off_t>* values = node->Values();
Unaligned<uint16>* keyLengths = node->KeyLengths();
uint8* keys = node->Keys();
node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() + 1);
node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(node->AllKeyLength()
+ keyLength);
Unaligned<off_t>* newValues = node->Values();
Unaligned<uint16>* newKeyLengths = node->KeyLengths();
memmove(newValues + index + 1, values + index,
sizeof(off_t) * (node->NumKeys() - 1 - index));
memmove(newValues, values, sizeof(off_t) * index);
newValues[index] = HOST_ENDIAN_TO_BFS_INT64(value);
for (uint16 i = node->NumKeys(); i-- > index + 1;) {
newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
BFS_ENDIAN_TO_HOST_INT16(keyLengths[i - 1]) + keyLength);
}
memmove(newKeyLengths, keyLengths, sizeof(uint16) * index);
int32 keyStart;
newKeyLengths[index] = HOST_ENDIAN_TO_BFS_INT16(keyLength
+ (keyStart = index > 0
? BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index - 1]) : 0));
uint16 length = BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index]);
int32 size = node->AllKeyLength() - length;
if (size > 0)
memmove(keys + length, keys + length - keyLength, size);
memcpy(keys + keyStart, key, keyLength);
}
status_t
BPlusTree::_SplitNode(bplustree_node* node, off_t nodeOffset,
bplustree_node* other, off_t otherOffset, uint16* _keyIndex, uint8* key,
uint16* _keyLength, off_t* _value)
{
if (*_keyIndex > node->NumKeys() + 1)
return B_BAD_VALUE;
Unaligned<uint16>* inKeyLengths = node->KeyLengths();
Unaligned<off_t>* inKeyValues = node->Values();
uint8* inKeys = node->Keys();
uint8* outKeys = other->Keys();
int32 keyIndex = *_keyIndex;
if (keyIndex > node->NumKeys()) {
FATAL(("key index out of bounds: %d, num keys: %u, inode %" B_PRIdOFF
"\n", (int)keyIndex, node->NumKeys(), fStream->ID()));
return B_BAD_VALUE;
}
int32 bytes = 0, bytesBefore = 0, bytesAfter = 0;
size_t size = fNodeSize >> 1;
int32 out, in;
size_t keyLengths = 0;
for (in = out = 0; in < node->NumKeys() + 1;) {
keyLengths = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]);
if (in == keyIndex && !bytes) {
bytes = *_keyLength;
bytesBefore = in > 0
? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
} else {
if (keyIndex < out)
bytesAfter = keyLengths - bytesBefore;
in++;
}
out++;
if (key_align(sizeof(bplustree_node) + bytes + keyLengths)
+ out * (sizeof(uint16) + sizeof(off_t)) >= size) {
break;
}
}
if (keyIndex >= out && in > 0)
bytesBefore = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]);
else if (keyIndex + 1 < out)
bytesAfter = keyLengths - bytesBefore;
if (bytesBefore < 0 || bytesAfter < 0)
return B_BAD_DATA;
other->left_link = node->left_link;
other->right_link = HOST_ENDIAN_TO_BFS_INT64(nodeOffset);
other->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore
+ bytesAfter);
other->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);
Unaligned<uint16>* outKeyLengths = other->KeyLengths();
Unaligned<off_t>* outKeyValues = other->Values();
int32 keys = out > keyIndex ? keyIndex : out;
if (bytesBefore) {
memcpy(outKeys, inKeys, bytesBefore);
memcpy(outKeyLengths, inKeyLengths, keys * sizeof(uint16));
memcpy(outKeyValues, inKeyValues, keys * sizeof(off_t));
}
if (bytes) {
memcpy(outKeys + bytesBefore, key, bytes);
outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);
if (bytesAfter) {
memcpy(outKeys + bytesBefore + bytes, inKeys + bytesBefore,
bytesAfter);
keys = out - keyIndex - 1;
for (int32 i = 0;i < keys;i++) {
outKeyLengths[keyIndex + i + 1] = HOST_ENDIAN_TO_BFS_INT16(
BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[keyIndex + i])
+ bytes);
}
memcpy(outKeyValues + keyIndex + 1, inKeyValues + keyIndex,
keys * sizeof(off_t));
}
}
if (in != out)
keyIndex--;
int32 total = bytesBefore + bytesAfter;
uint8* newKey = NULL;
uint16 newLength = 0;
bool newAllocated = false;
if (node->OverflowLink() != BPLUSTREE_NULL) {
if (in == keyIndex) {
newKey = key;
newLength = *_keyLength;
other->overflow_link = HOST_ENDIAN_TO_BFS_INT64(*_value);
keyIndex--;
} else {
uint8* droppedKey = node->KeyAt(in, &newLength);
if (droppedKey + newLength + sizeof(off_t) + sizeof(uint16)
> (uint8*)node + fNodeSize
|| newLength > BPLUSTREE_MAX_KEY_LENGTH) {
fStream->GetVolume()->Panic();
RETURN_ERROR(B_BAD_DATA);
}
newKey = (uint8*)malloc(newLength);
if (newKey == NULL)
return B_NO_MEMORY;
newAllocated = true;
memcpy(newKey, droppedKey, newLength);
other->overflow_link = inKeyValues[in];
total = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in++]);
}
}
bytesBefore = bytesAfter = bytes = 0;
out = 0;
int32 skip = in;
while (in < node->NumKeys() + 1) {
if (in == keyIndex && !bytes) {
bytesBefore = in > skip
? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
bytes = *_keyLength;
out++;
} else {
if (in < node->NumKeys()) {
inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) - total);
if (bytes) {
inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) + bytes);
bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in])
- bytesBefore - bytes;
}
out++;
}
in++;
}
}
if (keyIndex < skip)
bytesBefore = node->AllKeyLength() - total;
if (bytesBefore < 0 || bytesAfter < 0) {
if (newAllocated)
free(newKey);
return B_BAD_DATA;
}
node->left_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore
+ bytesAfter);
node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);
outKeyLengths = node->KeyLengths();
outKeyValues = node->Values();
keys = keyIndex <= skip ? out : keyIndex - skip;
keyIndex -= skip;
in = out - keyIndex - 1;
if (bytesBefore)
memmove(inKeys, inKeys + total, bytesBefore);
if (bytesAfter) {
memmove(inKeys + bytesBefore + bytes, inKeys + total + bytesBefore,
bytesAfter);
}
if (bytesBefore)
memmove(outKeyLengths, inKeyLengths + skip, keys * sizeof(uint16));
if (bytesAfter) {
memmove(outKeyLengths + keyIndex + 1, inKeyLengths + skip + keyIndex,
in * sizeof(uint16));
}
if (bytesBefore)
memmove(outKeyValues, inKeyValues + skip, keys * sizeof(off_t));
if (bytesAfter) {
memmove(outKeyValues + keyIndex + 1, inKeyValues + skip + keyIndex,
in * sizeof(off_t));
}
if (bytes) {
memcpy(inKeys + bytesBefore, key, bytes);
outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);
}
if (newKey == NULL)
newKey = other->KeyAt(other->NumKeys() - 1, &newLength);
memcpy(key, newKey, newLength);
*_keyLength = newLength;
*_value = otherOffset;
if (newAllocated)
free(newKey);
return B_OK;
}
status_t
BPlusTree::Insert(Transaction& transaction, const uint8* key, uint16 keyLength,
off_t value)
{
if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
|| keyLength > BPLUSTREE_MAX_KEY_LENGTH)
RETURN_ERROR(B_BAD_VALUE);
#ifdef DEBUG
if (value < 0)
panic("tried to insert invalid value %" B_PRId64 "!\n", value);
#endif
ASSERT_WRITE_LOCKED_INODE(fStream);
Stack<node_and_key> stack;
if (_SeekDown(stack, key, keyLength) != B_OK)
RETURN_ERROR(B_ERROR);
uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH];
memcpy(keyBuffer, key, keyLength);
node_and_key nodeAndKey;
const bplustree_node* node;
CachedNode cached(this);
while (stack.Pop(&nodeAndKey)
&& (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
#ifdef DEBUG
NodeChecker checker(node, fNodeSize, "insert");
#endif
if (node->IsLeaf()) {
status_t status = _FindKey(node, key, keyLength,
&nodeAndKey.keyIndex);
if (status == B_OK) {
if (fAllowDuplicates) {
status = _InsertDuplicate(transaction, cached, node,
nodeAndKey.keyIndex, value);
if (status != B_OK)
RETURN_ERROR(status);
return B_OK;
}
return B_NAME_IN_USE;
}
}
bplustree_node* writableNode = cached.MakeWritable(transaction);
if (writableNode == NULL)
return B_IO_ERROR;
if (int32(key_align(sizeof(bplustree_node)
+ writableNode->AllKeyLength() + keyLength)
+ (writableNode->NumKeys() + 1) * (sizeof(uint16)
+ sizeof(off_t))) < fNodeSize) {
_InsertKey(writableNode, nodeAndKey.keyIndex,
keyBuffer, keyLength, value);
_UpdateIterators(nodeAndKey.nodeOffset, BPLUSTREE_NULL,
nodeAndKey.keyIndex, 0, 1);
return B_OK;
} else {
CachedNode cachedNewRoot(this);
CachedNode cachedOther(this);
off_t newRoot = BPLUSTREE_NULL;
if (nodeAndKey.nodeOffset == fHeader.RootNode()) {
bplustree_node* root;
status_t status = cachedNewRoot.Allocate(transaction, &root,
&newRoot);
if (status != B_OK) {
RETURN_ERROR(status);
}
}
bplustree_node* other;
off_t otherOffset;
status_t status = cachedOther.Allocate(transaction, &other,
&otherOffset);
if (status != B_OK) {
cachedNewRoot.Free(transaction, newRoot);
RETURN_ERROR(status);
}
if (_SplitNode(writableNode, nodeAndKey.nodeOffset, other,
otherOffset, &nodeAndKey.keyIndex, keyBuffer, &keyLength,
&value) != B_OK) {
cachedOther.Free(transaction, otherOffset);
cachedNewRoot.Free(transaction, newRoot);
RETURN_ERROR(B_ERROR);
}
#ifdef DEBUG
checker.Check("insert split");
NodeChecker otherChecker(other, fNodeSize, "insert split other");
#endif
_UpdateIterators(nodeAndKey.nodeOffset, otherOffset,
nodeAndKey.keyIndex, writableNode->NumKeys(), 1);
if ((other = cachedOther.SetToWritable(transaction,
other->LeftLink())) != NULL) {
other->right_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
}
if (newRoot != BPLUSTREE_NULL) {
bplustree_node* root = cachedNewRoot.Node();
_InsertKey(root, 0, keyBuffer, keyLength,
writableNode->LeftLink());
root->overflow_link = HOST_ENDIAN_TO_BFS_INT64(
nodeAndKey.nodeOffset);
CachedNode cached(this);
bplustree_header* header
= cached.SetToWritableHeader(transaction);
if (header == NULL)
return B_IO_ERROR;
header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(newRoot);
header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(
header->MaxNumberOfLevels() + 1);
return B_OK;
}
}
}
RETURN_ERROR(B_ERROR);
}
status_t
BPlusTree::_RemoveDuplicate(Transaction& transaction,
const bplustree_node* node, CachedNode& cached, uint16 index,
off_t value)
{
Unaligned<off_t>* values = node->Values();
off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);
CachedNode cachedDuplicate(this);
off_t duplicateOffset = bplustree_node::FragmentOffset(oldValue);
bplustree_node* duplicate = cachedDuplicate.SetToWritable(transaction,
duplicateOffset, false);
if (duplicate == NULL)
return B_IO_ERROR;
if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) {
duplicate_array* array = duplicate->FragmentAt(
bplustree_node::FragmentIndex(oldValue));
int32 arrayCount = array->Count();
if (arrayCount > NUM_FRAGMENT_VALUES || arrayCount <= 1) {
FATAL(("_RemoveDuplicate: Invalid array[%d] size in fragment %"
B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
(int)bplustree_node::FragmentIndex(oldValue), duplicateOffset,
arrayCount, fStream->ID()));
return B_BAD_DATA;
}
if (!array->Remove(value)) {
FATAL(("Oh no, value %" B_PRIdOFF " not found in fragments of node "
"%" B_PRIdOFF "..., inode %" B_PRIdOFF "\n", value,
duplicateOffset, fStream->ID()));
return B_ENTRY_NOT_FOUND;
}
if (--arrayCount == 1) {
if (cached.MakeWritable(transaction) == NULL)
return B_IO_ERROR;
values[index] = array->values[0];
if (duplicate->FragmentsUsed(fNodeSize) == 1) {
status_t status = cachedDuplicate.Free(transaction,
duplicateOffset);
if (status != B_OK)
return status;
} else
array->count = 0;
}
return B_OK;
}
duplicate_array* array = NULL;
int32 arrayCount = 0;
if (duplicate->LeftLink() != BPLUSTREE_NULL) {
FATAL(("invalid duplicate node: first left link points to %" B_PRIdOFF
", inode %" B_PRIdOFF "!\n", duplicate->LeftLink(), fStream->ID()));
return B_BAD_DATA;
}
while (duplicate != NULL) {
array = duplicate->DuplicateArray();
arrayCount = array->Count();
if (arrayCount > NUM_DUPLICATE_VALUES || arrayCount < 0) {
FATAL(("_RemoveDuplicate: Invalid array size in duplicate %"
B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
duplicateOffset, arrayCount, fStream->ID()));
return B_BAD_DATA;
}
if (array->Remove(value)) {
arrayCount--;
break;
}
if ((duplicateOffset = duplicate->RightLink()) == BPLUSTREE_NULL)
RETURN_ERROR(B_ENTRY_NOT_FOUND);
cachedDuplicate.UnsetUnchanged(transaction);
duplicate = cachedDuplicate.SetToWritable(transaction, duplicateOffset,
false);
}
if (duplicate == NULL)
RETURN_ERROR(B_IO_ERROR);
while (true) {
off_t left = duplicate->LeftLink();
off_t right = duplicate->RightLink();
bool isLast = left == BPLUSTREE_NULL && right == BPLUSTREE_NULL;
if ((isLast && arrayCount == 1) || arrayCount == 0) {
if (left == BPLUSTREE_NULL) {
if (cached.MakeWritable(transaction) == NULL)
return B_IO_ERROR;
if (arrayCount == 1) {
values[index] = array->values[0];
} else {
values[index] = HOST_ENDIAN_TO_BFS_INT64(
bplustree_node::MakeLink(
BPLUSTREE_DUPLICATE_NODE, right));
}
}
status_t status = cachedDuplicate.Free(transaction,
duplicateOffset);
if (status != B_OK)
return status;
if (left != BPLUSTREE_NULL
&& (duplicate = cachedDuplicate.SetToWritable(transaction, left,
false)) != NULL) {
duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(right);
array = duplicate->DuplicateArray();
arrayCount = array->Count();
if (right == BPLUSTREE_NULL
&& duplicate->LeftLink() == BPLUSTREE_NULL
&& arrayCount <= NUM_FRAGMENT_VALUES) {
duplicateOffset = left;
continue;
}
}
if (right != BPLUSTREE_NULL
&& (duplicate = cachedDuplicate.SetToWritable(transaction,
right, false)) != NULL) {
duplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(left);
array = duplicate->DuplicateArray();
arrayCount = array->Count();
if (left == BPLUSTREE_NULL
&& duplicate->RightLink() == BPLUSTREE_NULL
&& arrayCount <= NUM_FRAGMENT_VALUES) {
duplicateOffset = right;
continue;
}
}
return B_OK;
}
if (isLast && arrayCount <= NUM_FRAGMENT_VALUES) {
CachedNode cachedOther(this);
bplustree_node* fragment = NULL;
uint32 fragmentIndex = 0;
off_t offset;
if (_FindFreeDuplicateFragment(transaction, node, cachedOther,
&offset, &fragment, &fragmentIndex) == B_OK) {
duplicate_array* target = fragment->FragmentAt(fragmentIndex);
memcpy(target, array,
(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
cachedDuplicate.Free(transaction, duplicateOffset);
duplicateOffset = offset;
} else {
memmove(duplicate, array,
(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
memset((off_t*)duplicate + NUM_FRAGMENT_VALUES + 1, 0,
fNodeSize - (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
}
if (cached.MakeWritable(transaction) == NULL)
return B_IO_ERROR;
values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
BPLUSTREE_DUPLICATE_FRAGMENT, duplicateOffset, fragmentIndex));
}
return B_OK;
}
}
void
BPlusTree::_RemoveKey(bplustree_node* node, uint16 index)
{
if (index > node->NumKeys() && node->NumKeys() > 0) {
FATAL(("Asked me to remove key outer limits: %u, inode %" B_PRIdOFF
"\n", index, fStream->ID()));
return;
}
Unaligned<off_t>* values = node->Values();
if (!node->IsLeaf() && index == node->NumKeys())
node->overflow_link = values[--index];
uint16 length = 0;
uint8* key = node->KeyAt(index, &length);
if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8*)node + fNodeSize
|| length > BPLUSTREE_MAX_KEY_LENGTH) {
FATAL(("Key length to long: %s, %u inode %" B_PRIdOFF "\n", key, length,
fStream->ID()));
fStream->GetVolume()->Panic();
return;
}
Unaligned<uint16>* keyLengths = node->KeyLengths();
uint8* keys = node->Keys();
node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() - 1);
node->all_key_length = HOST_ENDIAN_TO_BFS_INT64(
node->AllKeyLength() - length);
Unaligned<off_t>* newValues = node->Values();
Unaligned<uint16>* newKeyLengths = node->KeyLengths();
memmove(key, key + length, node->AllKeyLength() - (key - keys));
if (index > 0 && newKeyLengths != keyLengths)
memmove(newKeyLengths, keyLengths, index * sizeof(uint16));
for (uint16 i = index; i < node->NumKeys(); i++) {
newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
BFS_ENDIAN_TO_HOST_INT16(keyLengths[i + 1]) - length);
}
if (index > 0)
memmove(newValues, values, index * sizeof(off_t));
if (node->NumKeys() > index) {
memmove(newValues + index, values + index + 1,
(node->NumKeys() - index) * sizeof(off_t));
}
}
status_t
BPlusTree::Remove(Transaction& transaction, const uint8* key, uint16 keyLength,
off_t value)
{
if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
|| keyLength > BPLUSTREE_MAX_KEY_LENGTH)
RETURN_ERROR(B_BAD_VALUE);
ASSERT_WRITE_LOCKED_INODE(fStream);
Stack<node_and_key> stack;
if (_SeekDown(stack, key, keyLength) != B_OK)
RETURN_ERROR(B_ERROR);
node_and_key nodeAndKey;
const bplustree_node* node;
CachedNode cached(this);
while (stack.Pop(&nodeAndKey)
&& (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
#ifdef DEBUG
NodeChecker checker(node, fNodeSize, "remove");
#endif
if (node->IsLeaf()) {
status_t status = _FindKey(node, key, keyLength,
&nodeAndKey.keyIndex);
if (status != B_OK)
RETURN_ERROR(status);
if (bplustree_node::IsDuplicate(BFS_ENDIAN_TO_HOST_INT64(
node->Values()[nodeAndKey.keyIndex]))) {
if (fAllowDuplicates) {
return _RemoveDuplicate(transaction, node, cached,
nodeAndKey.keyIndex, value);
}
FATAL(("dupliate node found where no duplicates are "
"allowed, inode %" B_PRIdOFF "!\n", fStream->ID()));
RETURN_ERROR(B_ERROR);
} else {
if (node->Values()[nodeAndKey.keyIndex] != value)
return B_ENTRY_NOT_FOUND;
off_t next = node->RightLink() == BPLUSTREE_NULL
? BPLUSTREE_FREE : node->RightLink();
_UpdateIterators(nodeAndKey.nodeOffset, node->NumKeys() == 1
? next : BPLUSTREE_NULL, nodeAndKey.keyIndex, 0 , -1);
}
}
bplustree_node* writableNode = cached.MakeWritable(transaction);
if (writableNode == NULL)
return B_IO_ERROR;
if (nodeAndKey.nodeOffset == fHeader.RootNode()
&& (node->NumKeys() == 0
|| (node->NumKeys() == 1 && node->IsLeaf()))) {
writableNode->overflow_link
= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
writableNode->all_key_count = 0;
writableNode->all_key_length = 0;
if (fHeader.MaxNumberOfLevels() != 1) {
CachedNode cached(this);
bplustree_header* header
= cached.SetToWritableHeader(transaction);
if (header == NULL)
return B_IO_ERROR;
header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
}
return B_OK;
}
if (writableNode->NumKeys() > 1
|| (!writableNode->IsLeaf() && writableNode->NumKeys() == 1)) {
_RemoveKey(writableNode, nodeAndKey.keyIndex);
return B_OK;
}
CachedNode otherCached(this);
bplustree_node* other = otherCached.SetToWritable(transaction,
writableNode->LeftLink());
if (other != NULL)
other->right_link = writableNode->right_link;
if ((other = otherCached.SetToWritable(transaction, node->RightLink()))
!= NULL) {
other->left_link = writableNode->left_link;
}
cached.Free(transaction, nodeAndKey.nodeOffset);
}
RETURN_ERROR(B_ERROR);
}
status_t
BPlusTree::Replace(Transaction& transaction, const uint8* key, uint16 keyLength,
off_t value)
{
if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
|| key == NULL)
RETURN_ERROR(B_BAD_VALUE);
if (fAllowDuplicates)
RETURN_ERROR(B_BAD_TYPE);
ASSERT_WRITE_LOCKED_INODE(fStream);
off_t nodeOffset = fHeader.RootNode();
CachedNode cached(this);
const bplustree_node* node;
while ((node = cached.SetTo(nodeOffset)) != NULL) {
uint16 keyIndex = 0;
off_t nextOffset;
status_t status = _FindKey(node, key, keyLength, &keyIndex,
&nextOffset);
if (node->OverflowLink() == BPLUSTREE_NULL) {
if (status == B_OK) {
bplustree_node* writableNode = cached.MakeWritable(transaction);
if (writableNode != NULL) {
writableNode->Values()[keyIndex]
= HOST_ENDIAN_TO_BFS_INT64(value);
} else
status = B_IO_ERROR;
}
return status;
} else if (nextOffset == nodeOffset)
RETURN_ERROR(B_ERROR);
nodeOffset = nextOffset;
}
RETURN_ERROR(B_ERROR);
}
#endif
status_t
BPlusTree::Find(const uint8* key, uint16 keyLength, off_t* _value)
{
if (key == NULL || keyLength < BPLUSTREE_MIN_KEY_LENGTH)
RETURN_ERROR(B_BAD_VALUE);
if (keyLength > BPLUSTREE_MAX_KEY_LENGTH)
RETURN_ERROR(B_NAME_TOO_LONG);
if (fAllowDuplicates)
RETURN_ERROR(B_BAD_TYPE);
#if !_BOOT_MODE
ASSERT_READ_LOCKED_INODE(fStream);
#endif
off_t nodeOffset = fHeader.RootNode();
CachedNode cached(this);
const bplustree_node* node;
#ifdef DEBUG
int32 levels = 0;
#endif
while ((node = cached.SetTo(nodeOffset)) != NULL) {
uint16 keyIndex = 0;
off_t nextOffset;
status_t status = _FindKey(node, key, keyLength, &keyIndex,
&nextOffset);
#ifdef DEBUG
levels++;
#endif
if (node->OverflowLink() == BPLUSTREE_NULL) {
if (status == B_OK && _value != NULL)
*_value = BFS_ENDIAN_TO_HOST_INT64(node->Values()[keyIndex]);
#ifdef DEBUG
if (levels != (int32)fHeader.MaxNumberOfLevels())
DEBUGGER(("levels don't match"));
#endif
return status;
} else if (nextOffset == nodeOffset)
RETURN_ERROR(B_ERROR);
nodeOffset = nextOffset;
}
FATAL(("b+tree node at %" B_PRIdOFF " could not be loaded, inode %"
B_PRIdOFF "\n", nodeOffset, fStream->ID()));
RETURN_ERROR(B_ERROR);
}
#if !_BOOT_MODE
status_t
BPlusTree::_ValidateChildren(TreeCheck& check, uint32 level, off_t offset,
const uint8* largestKey, uint16 largestKeyLength,
const bplustree_node* parent)
{
if (parent->CheckIntegrity(fNodeSize) != B_OK) {
dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " integrity check "
"failed!\n", fStream->ID(), offset);
check.FoundError();
return B_OK;
}
if (level >= fHeader.MaxNumberOfLevels()) {
dprintf("inode %" B_PRIdOFF ": maximum level surpassed at %" B_PRIdOFF
"!\n", fStream->ID(), offset);
check.FoundError();
return B_OK;
}
check.SetLevel(level);
if (check.Visited(offset)) {
dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " already visited!\n",
fStream->ID(), offset);
check.FoundError();
return B_OK;
}
check.SetVisited(offset);
uint32 count = parent->NumKeys();
Unaligned<off_t>* values = parent->Values();
off_t lastOffset = check.PreviousOffset(level);
CachedNode cached(this);
for (uint32 i = 0; i < count; i++) {
uint16 keyLength;
uint8* key = parent->KeyAt(i, &keyLength);
if (largestKey != NULL) {
int result = _CompareKeys(key, keyLength, largestKey,
largestKeyLength);
if (result > 0 || (result == 0 && i != count - 1)) {
dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " key %"
B_PRIu32 " larger than it should!\n",
fStream->ID(), offset, i);
check.FoundError();
}
}
off_t childOffset = BFS_ENDIAN_TO_HOST_INT64(values[i]);
if (bplustree_node::IsDuplicate(childOffset)) {
off_t duplicateOffset = bplustree_node::FragmentOffset(childOffset);
off_t lastDuplicateOffset = BPLUSTREE_NULL;
while (duplicateOffset != BPLUSTREE_NULL) {
const bplustree_node* node;
status_t status = cached.SetTo(duplicateOffset, &node, false);
if (status != B_OK) {
if (status == B_IO_ERROR)
return B_IO_ERROR;
dprintf("inode %" B_PRIdOFF ": duplicate node at %"
B_PRIdOFF " could not be read: %s\n", fStream->ID(),
duplicateOffset, strerror(status));
check.FoundError();
break;
}
bool isFragmentNode = bplustree_node::LinkType(childOffset)
== BPLUSTREE_DUPLICATE_FRAGMENT;
bool isKnownFragment = isFragmentNode
&& check.VisitedFragment(duplicateOffset);
if (!isKnownFragment && check.Visited(duplicateOffset)) {
dprintf("inode %" B_PRIdOFF ": %s node at %"
B_PRIdOFF " already visited, referenced from %"
B_PRIdOFF "!\n", fStream->ID(),
isFragmentNode ? "fragment" : "duplicate",
duplicateOffset, offset);
check.FoundError();
break;
}
if (!check.Visited(duplicateOffset))
check.SetVisited(duplicateOffset);
if (!isKnownFragment && isFragmentNode)
check.SetVisitedFragment(duplicateOffset);
duplicate_array* array;
int32 minSize;
int32 maxSize;
if (isFragmentNode) {
array = node->FragmentAt(
bplustree_node::FragmentIndex(childOffset));
minSize = 2;
maxSize = NUM_FRAGMENT_VALUES;
} else {
array = node->DuplicateArray();
minSize = 1;
maxSize = NUM_DUPLICATE_VALUES;
}
int32 arrayCount = array->Count();
if (arrayCount < minSize || arrayCount > maxSize) {
dprintf("inode %" B_PRIdOFF ": duplicate at %" B_PRIdOFF
" has invalid array size %" B_PRId32 "!\n",
fStream->ID(), duplicateOffset, arrayCount);
check.FoundError();
} else {
for (int32 j = 0; j < arrayCount; j++) {
if (!fStream->GetVolume()->IsValidInodeBlock(
array->ValueAt(j))) {
dprintf("inode %" B_PRIdOFF ": duplicate at %"
B_PRIdOFF " contains invalid block %" B_PRIdOFF
" at %" B_PRId32 "!\n", fStream->ID(),
duplicateOffset, array->ValueAt(j), j);
check.FoundError();
break;
}
}
}
if (isFragmentNode)
break;
if (node->LeftLink() != lastDuplicateOffset) {
dprintf("inode %" B_PRIdOFF ": duplicate at %" B_PRIdOFF
" has wrong left link %" B_PRIdOFF ", expected %"
B_PRIdOFF "!\n", fStream->ID(), duplicateOffset,
node->LeftLink(), lastDuplicateOffset);
check.FoundError();
}
lastDuplicateOffset = duplicateOffset;
duplicateOffset = node->RightLink();
}
} else if (!parent->IsLeaf()) {
off_t nextOffset = parent->OverflowLink();
if (i < count - 1)
nextOffset = BFS_ENDIAN_TO_HOST_INT64(values[i + 1]);
if (i == 0 && lastOffset != BPLUSTREE_NULL) {
const bplustree_node* previous = cached.SetTo(lastOffset, true);
if (previous == NULL)
return B_IO_ERROR;
if (previous->RightLink() != childOffset) {
dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has "
"wrong right link %" B_PRIdOFF ", expected %" B_PRIdOFF
"!\n", fStream->ID(), lastOffset, previous->RightLink(),
childOffset);
check.FoundError();
}
}
status_t status = _ValidateChild(check, cached, level, childOffset,
lastOffset, nextOffset, key, keyLength);
if (status != B_OK)
return status;
} else if (!fStream->GetVolume()->IsValidInodeBlock(childOffset)) {
dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " contains "
"invalid block %" B_PRIdOFF " at %" B_PRId32 "!\n",
fStream->ID(), offset, childOffset, i);
check.FoundError();
}
lastOffset = childOffset;
}
if (parent->OverflowLink() != BPLUSTREE_NULL) {
off_t childOffset = parent->OverflowLink();
status_t status = _ValidateChild(check, cached, level, childOffset,
lastOffset, 0, NULL, 0);
if (status != B_OK)
return status;
lastOffset = childOffset;
}
check.SetPreviousOffset(level, lastOffset);
return B_OK;
}
status_t
BPlusTree::_ValidateChild(TreeCheck& check, CachedNode& cached, uint32 level,
off_t offset, off_t lastOffset, off_t nextOffset,
const uint8* key, uint16 keyLength)
{
const bplustree_node* node;
status_t status = cached.SetTo(offset, &node, true);
if (status != B_OK) {
if (status == B_IO_ERROR)
return B_IO_ERROR;
dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " could not be "
"read: %s\n", fStream->ID(), offset, strerror(status));
check.FoundError();
return B_OK;
}
if (node->LeftLink() != lastOffset) {
dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has "
"wrong left link %" B_PRIdOFF ", expected %" B_PRIdOFF
"!\n", fStream->ID(), offset, node->LeftLink(), lastOffset);
check.FoundError();
}
if (nextOffset != 0 && node->RightLink() != nextOffset) {
dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has "
"wrong right link %" B_PRIdOFF ", expected %" B_PRIdOFF
"!\n", fStream->ID(), offset, node->RightLink(), nextOffset);
check.FoundError();
}
return _ValidateChildren(check, level + 1, offset, key, keyLength, node);
}
#endif
TreeIterator::TreeIterator(BPlusTree* tree)
:
fTree(tree),
fCurrentNodeOffset(BPLUSTREE_NULL)
{
#if !_BOOT_MODE
tree->_AddIterator(this);
#endif
}
TreeIterator::~TreeIterator()
{
#if !_BOOT_MODE
if (fTree)
fTree->_RemoveIterator(this);
#endif
}
status_t
TreeIterator::Goto(int8 to)
{
if (fTree == NULL || fTree->fStream == NULL)
RETURN_ERROR(B_BAD_VALUE);
#if !_BOOT_MODE
InodeReadLocker locker(fTree->fStream);
#endif
off_t nodeOffset = fTree->fHeader.RootNode();
CachedNode cached(fTree);
const bplustree_node* node;
while ((node = cached.SetTo(nodeOffset)) != NULL) {
if (node->OverflowLink() == BPLUSTREE_NULL) {
fCurrentNodeOffset = nodeOffset;
fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->NumKeys();
fDuplicateNode = BPLUSTREE_NULL;
return B_OK;
}
off_t nextOffset;
if (to == BPLUSTREE_END || node->all_key_count == 0)
nextOffset = node->OverflowLink();
else {
if (node->AllKeyLength() > fTree->fNodeSize
|| (addr_t)node->Values() > (addr_t)node + fTree->fNodeSize
- 8 * node->NumKeys())
RETURN_ERROR(B_ERROR);
nextOffset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[0]);
}
if (nextOffset == nodeOffset)
break;
nodeOffset = nextOffset;
}
FATAL(("%s fails\n", __FUNCTION__));
RETURN_ERROR(B_ERROR);
}
status_t
TreeIterator::Traverse(int8 direction, void* key, uint16* keyLength,
uint16 maxLength, off_t* value, uint16* duplicate)
{
if (fTree == NULL)
return B_INTERRUPTED;
bool forward = direction == BPLUSTREE_FORWARD;
if (fCurrentNodeOffset == BPLUSTREE_NULL
&& Goto(forward ? BPLUSTREE_BEGIN : BPLUSTREE_END) != B_OK)
RETURN_ERROR(B_ERROR);
if (fCurrentNodeOffset == BPLUSTREE_FREE)
return B_ENTRY_NOT_FOUND;
#if !_BOOT_MODE
InodeReadLocker locker(fTree->fStream);
#endif
CachedNode cached(fTree);
const bplustree_node* node;
if (fDuplicateNode != BPLUSTREE_NULL) {
if (!fIsFragment || fDuplicate < fNumDuplicates) {
node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode),
false);
} else
node = NULL;
if (node != NULL) {
if (!fIsFragment && fDuplicate >= fNumDuplicates) {
fDuplicateNode = node->RightLink();
if (fDuplicateNode != BPLUSTREE_NULL
&& (node = cached.SetTo(fDuplicateNode, false)) != NULL) {
fNumDuplicates = node->CountDuplicates(fDuplicateNode,
false);
fDuplicate = 0;
}
}
if (fDuplicate < fNumDuplicates) {
*value = node->DuplicateAt(fDuplicateNode, fIsFragment,
fDuplicate++);
if (duplicate)
*duplicate = 2;
return B_OK;
}
}
fDuplicateNode = BPLUSTREE_NULL;
}
off_t savedNodeOffset = fCurrentNodeOffset;
int32 savedKey = fCurrentKey;
if ((node = cached.SetTo(fCurrentNodeOffset)) == NULL)
RETURN_ERROR(B_ERROR);
if (duplicate)
*duplicate = 0;
fCurrentKey += direction;
while ((forward && fCurrentKey >= node->NumKeys())
|| (!forward && fCurrentKey < 0)) {
fCurrentNodeOffset = forward ? node->RightLink() : node->LeftLink();
if (fCurrentNodeOffset != BPLUSTREE_NULL) {
node = cached.SetTo(fCurrentNodeOffset);
if (!node)
RETURN_ERROR(B_ERROR);
fCurrentKey = forward ? 0 : node->NumKeys() - 1;
} else {
fCurrentNodeOffset = savedNodeOffset;
fCurrentKey = savedKey;
return B_ENTRY_NOT_FOUND;
}
}
if (node->all_key_count == 0)
RETURN_ERROR(B_ERROR);
uint16 length = 0;
uint8* keyStart = node->KeyAt(fCurrentKey, &length);
if (keyStart + length + sizeof(off_t) + sizeof(uint16)
> (uint8*)node + fTree->fNodeSize
|| length > BPLUSTREE_MAX_KEY_LENGTH) {
#if !_BOOT_MODE
fTree->fStream->GetVolume()->Panic();
#endif
RETURN_ERROR(B_BAD_DATA);
}
bool needsTermination = fTree->fHeader.DataType() == BPLUSTREE_STRING_TYPE;
if (length + (needsTermination ? 1 : 0) > maxLength) {
if (!needsTermination || maxLength < INODE_FILE_NAME_LENGTH) {
fCurrentNodeOffset = savedNodeOffset;
fCurrentKey = savedKey;
return B_BUFFER_OVERFLOW;
}
length = maxLength - 1;
}
memcpy(key, keyStart, length);
if (needsTermination)
((char*)key)[length] = '\0';
*keyLength = length;
off_t offset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[fCurrentKey]);
uint8 type = bplustree_node::LinkType(offset);
if (type == BPLUSTREE_DUPLICATE_FRAGMENT
|| type == BPLUSTREE_DUPLICATE_NODE) {
fDuplicateNode = offset;
node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode),
false);
if (node == NULL)
RETURN_ERROR(B_ERROR);
fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT;
fNumDuplicates = node->CountDuplicates(offset, fIsFragment);
if (fNumDuplicates) {
offset = node->DuplicateAt(offset, fIsFragment, 0);
fDuplicate = 1;
if (duplicate)
*duplicate = 1;
} else {
fDuplicateNode = BPLUSTREE_NULL;
offset = 0;
}
}
*value = offset;
return B_OK;
}
status_t
TreeIterator::Find(const uint8* key, uint16 keyLength)
{
if (fTree == NULL)
return B_INTERRUPTED;
if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
|| key == NULL)
RETURN_ERROR(B_BAD_VALUE);
#if !_BOOT_MODE
InodeReadLocker locker(fTree->fStream);
#endif
off_t nodeOffset = fTree->fHeader.RootNode();
CachedNode cached(fTree);
const bplustree_node* node;
while ((node = cached.SetTo(nodeOffset)) != NULL) {
uint16 keyIndex = 0;
off_t nextOffset = 0;
status_t status = fTree->_FindKey(node, key, keyLength, &keyIndex,
&nextOffset);
if (node->OverflowLink() == BPLUSTREE_NULL) {
fCurrentNodeOffset = nodeOffset;
fCurrentKey = keyIndex - 1;
fDuplicateNode = BPLUSTREE_NULL;
return status;
} else if (nextOffset == nodeOffset)
RETURN_ERROR(B_ERROR);
nodeOffset = nextOffset;
}
RETURN_ERROR(B_ERROR);
}
void
TreeIterator::SkipDuplicates()
{
fDuplicateNode = BPLUSTREE_NULL;
}
void
TreeIterator::Update(off_t offset, off_t nextOffset, uint16 keyIndex,
uint16 splitAt, int8 change)
{
if (offset != fCurrentNodeOffset)
return;
if (nextOffset != BPLUSTREE_NULL) {
fCurrentNodeOffset = nextOffset;
if (splitAt <= fCurrentKey) {
fCurrentKey -= splitAt;
keyIndex -= splitAt;
}
}
if (keyIndex <= fCurrentKey)
fCurrentKey += change;
}
void
TreeIterator::Stop()
{
fTree = NULL;
}
#ifdef DEBUG
void
TreeIterator::Dump()
{
__out("TreeIterator at %p:\n", this);
__out("\tfTree = %p\n", fTree);
__out("\tfCurrentNodeOffset = %" B_PRId64 "\n", fCurrentNodeOffset);
__out("\tfCurrentKey = %d\n", (int)fCurrentKey);
__out("\tfDuplicateNode = %" B_PRId64 " (%" B_PRId64 ", 0x%" B_PRIx64 ")\n",
bplustree_node::FragmentOffset(fDuplicateNode), fDuplicateNode,
fDuplicateNode);
__out("\tfDuplicate = %u\n", fDuplicate);
__out("\tfNumDuplicates = %u\n", fNumDuplicates);
__out("\tfIsFragment = %s\n", fIsFragment ? "true" : "false");
}
#endif
bool
bplustree_header::IsValid() const
{
return Magic() == BPLUSTREE_MAGIC
&& (RootNode() % NodeSize()) == 0
&& IsValidLink(RootNode())
&& IsValidLink(FreeNode());
}
void
bplustree_node::Initialize()
{
left_link = right_link = overflow_link
= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
all_key_count = 0;
all_key_length = 0;
}
uint8*
bplustree_node::KeyAt(int32 index, uint16* keyLength) const
{
if (index < 0 || index > NumKeys())
return NULL;
uint8* keyStart = Keys();
Unaligned<uint16>* keyLengths = KeyLengths();
*keyLength = BFS_ENDIAN_TO_HOST_INT16(keyLengths[index])
- (index != 0 ? BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]) : 0);
if (index > 0)
keyStart += BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]);
return keyStart;
}
uint8
bplustree_node::CountDuplicates(off_t offset, bool isFragment) const
{
if (isFragment) {
uint32 fragment = (NUM_FRAGMENT_VALUES + 1) * ((uint64)offset & 0x3ff);
return ((off_t*)this)[fragment];
}
return OverflowLink();
}
off_t
bplustree_node::DuplicateAt(off_t offset, bool isFragment, int8 index) const
{
uint32 start;
if (isFragment)
start = 8 * ((uint64)offset & 0x3ff);
else
start = 2;
return ((off_t*)this)[start + 1 + index];
}
uint32
bplustree_node::FragmentsUsed(uint32 nodeSize) const
{
uint32 used = 0;
for (uint32 i = 0; i < MaxFragments(nodeSize); i++) {
duplicate_array* array = FragmentAt(i);
if (array->Count() > 0 && ++used > 1)
return used;
}
return used;
}
status_t
bplustree_node::CheckIntegrity(uint32 nodeSize) const
{
if (NumKeys() > nodeSize || AllKeyLength() > nodeSize)
DEBUGGER(("invalid node: key/length count"));
for (int32 i = 0; i < NumKeys(); i++) {
uint16 length = 0;
uint8* key = KeyAt(i, &length);
if (key + length + sizeof(off_t) + sizeof(uint16)
> (uint8*)this + nodeSize
|| length > BPLUSTREE_MAX_KEY_LENGTH) {
dprintf("invalid node %p, key %d: keys corrupted\n", this, (int)i);
return B_BAD_DATA;
}
if (Values()[i] == -1) {
dprintf("invalid node %p, value %d: %" B_PRIdOFF ": values "
"corrupted\n", this, (int)i, Values()[i].value);
return B_BAD_DATA;
}
}
return B_OK;
}
#if !_BOOT_MODE
BitmapArray::BitmapArray(size_t numBits)
{
fSize = (numBits + 7) / 8;
fBitmap = (uint8*)calloc(fSize, 1);
fCountSet = 0;
}
BitmapArray::~BitmapArray()
{
free(fBitmap);
}
status_t
BitmapArray::InitCheck() const
{
return fBitmap != NULL ? B_OK : B_NO_MEMORY;
}
bool
BitmapArray::IsSet(size_t index) const
{
uint32 byteIndex = index / 8;
if (byteIndex >= fSize)
return false;
return (fBitmap[byteIndex] & (1UL << (index & 0x7))) != 0;
}
void
BitmapArray::Set(size_t index, bool set)
{
uint32 byteIndex = index / 8;
if (byteIndex >= fSize)
return;
if (set) {
fBitmap[byteIndex] |= 1UL << (index & 0x7);
fCountSet++;
} else {
fBitmap[byteIndex] &= ~(1UL << (index & 0x7));
fCountSet--;
}
}
#endif
bool
duplicate_array::_FindInternal(off_t value, int32& index) const
{
int32 min = 0, max = Count() - 1;
off_t cmp;
while (min <= max) {
index = (min + max) / 2;
cmp = ValueAt(index) - value;
if (cmp < 0)
min = index + 1;
else if (cmp > 0)
max = index - 1;
else
return true;
}
return false;
}
void
duplicate_array::Insert(off_t value)
{
int32 size = Count();
int32 i = 0;
if (size > 8 ) {
if (!_FindInternal(value, i) && ValueAt(i) <= value)
i++;
} else {
for (i = 0; i < size; i++) {
if (ValueAt(i) > value)
break;
}
}
memmove(&values[i + 1], &values[i], (size - i) * sizeof(off_t));
values[i] = HOST_ENDIAN_TO_BFS_INT64(value);
count = HOST_ENDIAN_TO_BFS_INT64(size + 1);
}
bool
duplicate_array::Remove(off_t value)
{
int32 index = Find(value);
if (index == -1)
return false;
int32 newSize = Count() - 1;
memmove(&values[index], &values[index + 1],
(newSize - index) * sizeof(off_t));
count = HOST_ENDIAN_TO_BFS_INT64(newSize);
return true;
}
#if _BOOT_MODE
}
#endif