#include "slab-depot.h"
#include <linux/atomic.h>
#include <linux/bio.h>
#include <linux/err.h>
#include <linux/log2.h>
#include <linux/min_heap.h>
#include <linux/minmax.h>
#include "logger.h"
#include "memory-alloc.h"
#include "numeric.h"
#include "permassert.h"
#include "string-utils.h"
#include "action-manager.h"
#include "admin-state.h"
#include "completion.h"
#include "constants.h"
#include "data-vio.h"
#include "encodings.h"
#include "io-submitter.h"
#include "physical-zone.h"
#include "priority-table.h"
#include "recovery-journal.h"
#include "repair.h"
#include "status-codes.h"
#include "types.h"
#include "vdo.h"
#include "vio.h"
#include "wait-queue.h"
/* Bytes in one 64-bit word. */
static const u64 BYTES_PER_WORD = sizeof(u64);
/* Passed as the normal_operation argument of update_reference_count(). */
static const bool NORMAL_OPERATION = true;
/* Map a sequence number onto its slot in the journal's circular lock array. */
static inline struct journal_lock * __must_check get_lock(struct slab_journal *journal,
							  sequence_number_t sequence_number)
{
	sequence_number_t slot = sequence_number % journal->size;

	return &journal->locks[slot];
}
/* A slab is open unless it is quiescing or already quiescent. */
static bool is_slab_open(struct vdo_slab *slab)
{
	return !(vdo_is_state_quiescing(&slab->state) ||
		 vdo_is_state_quiescent(&slab->state));
}
/*
 * Check whether there are entry waiters which should delay a flush; a
 * rebuilding slab never delays.
 */
static inline bool __must_check must_make_entries_to_flush(struct slab_journal *journal)
{
	if (journal->slab->status == VDO_SLAB_REBUILDING)
		return false;

	return vdo_waitq_has_waiters(&journal->entry_waiters);
}
/* A reap is in progress while the head lags behind unreapable. */
static inline bool __must_check is_reaping(struct slab_journal *journal)
{
	return journal->head != journal->unreapable;
}
/* Open a fresh, empty tail block at the journal's current tail sequence number. */
static void initialize_tail_block(struct slab_journal *journal)
{
	struct slab_journal_block_header *tail_header = &journal->tail_header;

	tail_header->sequence_number = journal->tail;
	tail_header->entry_count = 0;
	tail_header->has_block_map_increments = false;
}
/*
 * Reset the journal's in-memory bookkeeping so reaping and committing
 * restart from the current head/tail positions.
 */
static void initialize_journal_state(struct slab_journal *journal)
{
	journal->unreapable = journal->head;
	/* reap_lock must be derived after unreapable is set. */
	journal->reap_lock = get_lock(journal, journal->unreapable);
	journal->next_commit = journal->tail;
	journal->summarized = journal->last_summarized = journal->tail;
	initialize_tail_block(journal);
}
/*
 * Check whether the tail block has reached capacity. Blocks carrying
 * block-map increments use the smaller full-entry capacity.
 */
static bool __must_check block_is_full(struct slab_journal *journal)
{
	journal_entry_count_t count = journal->tail_header.entry_count;

	if (journal->tail_header.has_block_map_increments)
		return journal->full_entries_per_block == count;

	return journal->entries_per_block == count;
}
static void add_entries(struct slab_journal *journal);
static void update_tail_block_location(struct slab_journal *journal);
static void release_journal_locks(struct vdo_waiter *waiter, void *context);
static bool is_slab_journal_blank(const struct vdo_slab *slab)
{
return ((slab->journal.tail == 1) &&
(slab->journal.tail_header.entry_count == 0));
}
/*
 * Record the recovery journal lock this slab journal holds and insert the
 * journal into the allocator's dirty list, keeping the list sorted by
 * recovery_lock (oldest first).
 */
static void mark_slab_journal_dirty(struct slab_journal *journal, sequence_number_t lock)
{
	struct slab_journal *dirty_journal;
	struct list_head *dirty_list = &journal->slab->allocator->dirty_slab_journals;

	VDO_ASSERT_LOG_ONLY(journal->recovery_lock == 0, "slab journal was clean");

	journal->recovery_lock = lock;
	/* Scan backwards for the last entry with an older-or-equal lock... */
	list_for_each_entry_reverse(dirty_journal, dirty_list, dirty_entry) {
		if (dirty_journal->recovery_lock <= journal->recovery_lock)
			break;
	}
	/* ...and insert this journal immediately after it. */
	list_move_tail(&journal->dirty_entry, dirty_journal->dirty_entry.next);
}
/* Drop the journal's recovery lock and remove it from the dirty list. */
static void mark_slab_journal_clean(struct slab_journal *journal)
{
	journal->recovery_lock = 0;
	list_del_init(&journal->dirty_entry);
}
/*
 * Finish the slab's drain operation once all journal and reference-block
 * activity has ceased; otherwise do nothing.
 */
static void check_if_slab_drained(struct vdo_slab *slab)
{
	bool read_only;
	struct slab_journal *journal = &slab->journal;
	const struct admin_state_code *code;

	/* Still busy if not draining, or any journal I/O or slab work remains. */
	if (!vdo_is_state_draining(&slab->state) ||
	    must_make_entries_to_flush(journal) ||
	    is_reaping(journal) ||
	    journal->waiting_to_commit ||
	    !list_empty(&journal->uncommitted_blocks) ||
	    journal->updating_slab_summary ||
	    (slab->active_count > 0))
		return;

	/*
	 * When not read-only, dirty reference blocks must still be written
	 * out, except while suspending or recovering.
	 */
	code = vdo_get_admin_state_code(&slab->state);
	read_only = vdo_is_read_only(slab->allocator->depot->vdo);
	if (!read_only &&
	    vdo_waitq_has_waiters(&slab->dirty_blocks) &&
	    (code != VDO_ADMIN_STATE_SUSPENDING) &&
	    (code != VDO_ADMIN_STATE_RECOVERING))
		return;

	vdo_finish_draining_with_result(&slab->state,
					(read_only ? VDO_READ_ONLY : VDO_SUCCESS));
}
/*
 * Compress a free-block count into the summary's small fullness hint.
 * Zero is reserved for "no free blocks"; any non-zero count yields at
 * least 1.
 */
static u8 __must_check compute_fullness_hint(struct slab_depot *depot,
					     block_count_t free_blocks)
{
	block_count_t hint;

	VDO_ASSERT_LOG_ONLY((free_blocks < (1 << 23)), "free blocks must be less than 2^23");

	if (free_blocks == 0)
		return 0;

	hint = free_blocks >> depot->hint_shift;
	if (hint == 0)
		return 1;

	return hint;
}
/*
 * Finish the summary drain operation once all summary writes have
 * completed; otherwise do nothing.
 */
static void check_summary_drain_complete(struct block_allocator *allocator)
{
	int result;

	if (!vdo_is_state_draining(&allocator->summary_state))
		return;

	if (allocator->summary_write_count > 0)
		return;

	result = (vdo_is_read_only(allocator->depot->vdo) ?
		  VDO_READ_ONLY : VDO_SUCCESS);
	vdo_finish_operation(&allocator->summary_state, result);
}
/* Notify all waiters on a summary queue of success, or of read-only mode. */
static void notify_summary_waiters(struct block_allocator *allocator,
				   struct vdo_wait_queue *queue)
{
	int result = VDO_SUCCESS;

	if (vdo_is_read_only(allocator->depot->vdo))
		result = VDO_READ_ONLY;

	vdo_waitq_notify_all_waiters(queue, NULL, &result);
}
static void launch_write(struct slab_summary_block *summary_block);
/*
 * Conclude an in-flight summary block write: notify the waiters whose
 * updates it covered, then either launch the next batched write or check
 * whether the summary drain is complete.
 */
static void finish_updating_slab_summary_block(struct slab_summary_block *block)
{
	notify_summary_waiters(block->allocator, &block->current_update_waiters);
	block->writing = false;
	block->allocator->summary_write_count--;
	if (vdo_waitq_has_waiters(&block->next_update_waiters))
		launch_write(block);
	else
		check_summary_drain_complete(block->allocator);
}
/* Success completion for a summary block write: count it and finish up. */
static void finish_update(struct vdo_completion *completion)
{
	struct vio *vio = as_vio(completion);
	struct slab_summary_block *block =
		container_of(vio, struct slab_summary_block, vio);

	atomic64_inc(&block->allocator->depot->summary_statistics.blocks_written);
	finish_updating_slab_summary_block(block);
}
/*
 * Error completion for a summary block write: record the error, go
 * read-only, and finish the update so waiters are still notified.
 */
static void handle_write_error(struct vdo_completion *completion)
{
	struct vio *vio = as_vio(completion);
	struct slab_summary_block *block =
		container_of(vio, struct slab_summary_block, vio);

	vio_record_metadata_io_error(vio);
	vdo_enter_read_only_mode(completion->vdo, completion->result);
	finish_updating_slab_summary_block(block);
}
static void write_slab_summary_endio(struct bio *bio)
{
struct vio *vio = bio->bi_private;
struct slab_summary_block *block =
container_of(vio, struct slab_summary_block, vio);
continue_vio_after_io(vio, finish_update, block->allocator->thread_id);
}
/*
 * Write a summary block if it is not already being written. Waiters queued
 * for the next update become the current set and are notified when this
 * write completes.
 */
static void launch_write(struct slab_summary_block *block)
{
	struct block_allocator *allocator = block->allocator;
	struct slab_depot *depot = allocator->depot;
	physical_block_number_t pbn;

	if (block->writing)
		return;

	allocator->summary_write_count++;
	vdo_waitq_transfer_all_waiters(&block->next_update_waiters,
				       &block->current_update_waiters);
	block->writing = true;

	/* In read-only mode, skip the I/O and just complete the update. */
	if (vdo_is_read_only(depot->vdo)) {
		finish_updating_slab_summary_block(block);
		return;
	}

	/* Snapshot the entries so later updates don't race with the write. */
	memcpy(block->outgoing_entries, block->entries, VDO_BLOCK_SIZE);
	pbn = (depot->summary_origin +
	       (VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE * allocator->zone_number) +
	       block->index);
	/*
	 * NOTE(review): REQ_PREFLUSH presumably orders this summary write
	 * after the journal writes it describes — confirm.
	 */
	vdo_submit_metadata_vio(&block->vio, pbn, write_slab_summary_endio,
				handle_write_error, REQ_OP_WRITE | REQ_PREFLUSH);
}
/*
 * Update a slab's summary entry and queue the waiter to be notified when
 * the containing summary block is written. Fails immediately (via the
 * waiter callback) in read-only mode or while the summary is
 * draining/quiescent.
 */
static void update_slab_summary_entry(struct vdo_slab *slab, struct vdo_waiter *waiter,
				      tail_block_offset_t tail_block_offset,
				      bool load_ref_counts, bool is_clean,
				      block_count_t free_blocks)
{
	u8 index = slab->slab_number / VDO_SLAB_SUMMARY_ENTRIES_PER_BLOCK;
	struct block_allocator *allocator = slab->allocator;
	struct slab_summary_block *block = &allocator->summary_blocks[index];
	int result;
	struct slab_summary_entry *entry;

	if (vdo_is_read_only(block->vio.completion.vdo)) {
		result = VDO_READ_ONLY;
		waiter->callback(waiter, &result);
		return;
	}

	if (vdo_is_state_draining(&allocator->summary_state) ||
	    vdo_is_state_quiescent(&allocator->summary_state)) {
		result = VDO_INVALID_ADMIN_STATE;
		waiter->callback(waiter, &result);
		return;
	}

	entry = &allocator->summary_entries[slab->slab_number];
	*entry = (struct slab_summary_entry) {
		.tail_block_offset = tail_block_offset,
		/* load_ref_counts is sticky: once set, it stays set. */
		.load_ref_counts = (entry->load_ref_counts || load_ref_counts),
		.is_dirty = !is_clean,
		.fullness_hint = compute_fullness_hint(allocator->depot, free_blocks),
	};
	vdo_waitq_enqueue_waiter(&block->next_update_waiters, waiter);
	launch_write(block);
}
/* Conclude a reap: advance the head, then resume entry-making and check drain. */
static void finish_reaping(struct slab_journal *journal)
{
	journal->head = journal->unreapable;
	add_entries(journal);
	check_if_slab_drained(journal->slab);
}
static void reap_slab_journal(struct slab_journal *journal);
/*
 * Completion for the reap flush: return the borrowed vio to the pool,
 * finish this reap, then immediately attempt to reap further.
 */
static void complete_reaping(struct vdo_completion *completion)
{
	struct slab_journal *journal = completion->parent;

	return_vio_to_pool(vio_as_pooled_vio(as_vio(completion)));
	finish_reaping(journal);
	reap_slab_journal(journal);
}
/*
 * Error handler for the reap flush: record the failure and go read-only,
 * but still complete the reap so the vio is returned to the pool.
 */
static void handle_flush_error(struct vdo_completion *completion)
{
	struct vio *vio = as_vio(completion);

	vio_record_metadata_io_error(vio);
	vdo_enter_read_only_mode(completion->vdo, completion->result);
	complete_reaping(completion);
}
static void flush_endio(struct bio *bio)
{
struct vio *vio = bio->bi_private;
struct slab_journal *journal = vio->completion.parent;
continue_vio_after_io(vio, complete_reaping,
journal->slab->allocator->thread_id);
}
/* Waiter callback: issue the pre-reap flush on a vio from the pool. */
static void flush_for_reaping(struct vdo_waiter *waiter, void *context)
{
	struct pooled_vio *pooled = context;
	struct slab_journal *journal =
		container_of(waiter, struct slab_journal, flush_waiter);
	struct vio *vio = &pooled->vio;

	vio->completion.parent = journal;
	vdo_submit_flush_vio(vio, flush_endio, handle_flush_error);
}
/*
 * Advance unreapable past every fully-unlocked journal block. If anything
 * was reaped, issue a flush before the head is moved — NOTE(review):
 * presumably so the writes which released those locks are durable first;
 * confirm.
 */
static void reap_slab_journal(struct slab_journal *journal)
{
	bool reaped = false;

	if (is_reaping(journal)) {
		/* A reap is already in progress. */
		return;
	}

	/* Only a rebuilt, normally operating, writable slab may reap. */
	if ((journal->slab->status != VDO_SLAB_REBUILT) ||
	    !vdo_is_state_normal(&journal->slab->state) ||
	    vdo_is_read_only(journal->slab->allocator->depot->vdo)) {
		return;
	}

	/*
	 * Walk forward over blocks whose lock count has dropped to zero,
	 * wrapping reap_lock around the circular lock array.
	 */
	while ((journal->unreapable < journal->tail) && (journal->reap_lock->count == 0)) {
		reaped = true;
		journal->unreapable++;
		journal->reap_lock++;
		if (journal->reap_lock == &journal->locks[journal->size])
			journal->reap_lock = &journal->locks[0];
	}

	if (!reaped)
		return;

	journal->flush_waiter.callback = flush_for_reaping;
	acquire_vio_from_pool(journal->slab->allocator->vio_pool,
			      &journal->flush_waiter);
}
/*
 * Adjust the lock count of a slab journal block; a count that drops to
 * zero may let the journal reap. Sequence number 0 and replaying slabs are
 * ignored.
 */
static void adjust_slab_journal_block_reference(struct slab_journal *journal,
						sequence_number_t sequence_number,
						int adjustment)
{
	struct journal_lock *lock;

	if (sequence_number == 0)
		return;

	if (journal->slab->status == VDO_SLAB_REPLAYING) {
		/* Replayed entries hold no locks on journal blocks. */
		return;
	}

	VDO_ASSERT_LOG_ONLY((adjustment != 0), "adjustment must be non-zero");
	lock = get_lock(journal, sequence_number);
	if (adjustment < 0) {
		VDO_ASSERT_LOG_ONLY((-adjustment <= lock->count),
				    "adjustment %d of lock count %u for slab journal block %llu must not underflow",
				    adjustment, lock->count,
				    (unsigned long long) sequence_number);
	}

	lock->count += adjustment;
	if (lock->count == 0)
		reap_slab_journal(journal);
}
/*
 * Callback run when the slab summary entry recording a new tail block has
 * been written: release the per-block and recovery-journal locks for every
 * newly summarized journal block, then resume reaping and committing.
 */
static void release_journal_locks(struct vdo_waiter *waiter, void *context)
{
	sequence_number_t first, i;
	struct slab_journal *journal =
		container_of(waiter, struct slab_journal, slab_summary_waiter);
	int result = *((int *) context);

	if (result != VDO_SUCCESS) {
		if (result != VDO_READ_ONLY) {
			/* Don't log when already read-only; the error is expected. */
			vdo_log_error_strerror(result, "failed slab summary update %llu",
					       (unsigned long long) journal->summarized);
		}

		journal->updating_slab_summary = false;
		vdo_enter_read_only_mode(journal->slab->allocator->depot->vdo, result);
		check_if_slab_drained(journal->slab);
		return;
	}

	/* A partial write is fully persisted once summarized == tail. */
	if (journal->partial_write_in_progress && (journal->summarized == journal->tail)) {
		journal->partial_write_in_progress = false;
		add_entries(journal);
	}

	first = journal->last_summarized;
	journal->last_summarized = journal->summarized;
	/*
	 * NOTE(review): i is unsigned, so this loop relies on first >= 1
	 * (sequence numbers starting above zero) to terminate — confirm.
	 */
	for (i = journal->summarized - 1; i >= first; i--) {
		if (journal->recovery_journal != NULL) {
			zone_count_t zone_number = journal->slab->allocator->zone_number;
			struct journal_lock *lock = get_lock(journal, i);

			vdo_release_recovery_journal_block_reference(journal->recovery_journal,
								     lock->recovery_start,
								     VDO_ZONE_TYPE_PHYSICAL,
								     zone_number);
		}

		adjust_slab_journal_block_reference(journal, i, -1);
	}

	journal->updating_slab_summary = false;

	reap_slab_journal(journal);

	/* Check whether the summary needs updating again. */
	update_tail_block_location(journal);
}
/*
 * Record the journal's new tail block location in the slab summary, if an
 * update is needed and none is already in flight.
 */
static void update_tail_block_location(struct slab_journal *journal)
{
	block_count_t free_block_count;
	struct vdo_slab *slab = journal->slab;

	if (journal->updating_slab_summary ||
	    vdo_is_read_only(journal->slab->allocator->depot->vdo) ||
	    (journal->last_summarized >= journal->next_commit)) {
		check_if_slab_drained(slab);
		return;
	}

	if (slab->status != VDO_SLAB_REBUILT) {
		/*
		 * Not fully rebuilt: approximate the free-block count from
		 * the stored fullness hint.
		 */
		u8 hint = slab->allocator->summary_entries[slab->slab_number].fullness_hint;

		free_block_count = ((block_count_t) hint) << slab->allocator->depot->hint_shift;
	} else {
		free_block_count = slab->free_blocks;
	}

	journal->summarized = journal->next_commit;
	journal->updating_slab_summary = true;

	/*
	 * release_journal_locks() (the slab_summary_waiter callback) runs
	 * when this summary update completes.
	 */
	update_slab_summary_entry(slab, &journal->slab_summary_waiter,
				  journal->summarized % journal->size,
				  (journal->head > 1), false, free_block_count);
}
/*
 * Reset a scrubbed slab's journal to an empty state so it can accept new
 * entries, asserting no block locks survived scrubbing.
 */
static void reopen_slab_journal(struct vdo_slab *slab)
{
	struct slab_journal *journal = &slab->journal;
	sequence_number_t block;

	VDO_ASSERT_LOG_ONLY(journal->tail_header.entry_count == 0,
			    "vdo_slab journal's active block empty before reopening");

	journal->head = journal->tail;
	initialize_journal_state(journal);

	/* Ensure no locks are spuriously held on an empty journal. */
	for (block = 1; block <= journal->size; block++) {
		VDO_ASSERT_LOG_ONLY((get_lock(journal, block)->count == 0),
				    "Scrubbed journal's block %llu is not locked",
				    (unsigned long long) block);
	}

	add_entries(journal);
}
/* Read the on-the-wire sequence number from a vio's packed journal block. */
static sequence_number_t get_committing_sequence_number(const struct pooled_vio *vio)
{
	const struct packed_slab_journal_block *committing =
		(const struct packed_slab_journal_block *) vio->vio.data;

	return __le64_to_cpu(committing->header.sequence_number);
}
/*
 * Completion for a slab journal block write. On failure the VDO goes
 * read-only; on success, next_commit advances to the oldest block still in
 * flight (or the tail if none) and the summary update is retried.
 */
static void complete_write(struct vdo_completion *completion)
{
	int result = completion->result;
	struct pooled_vio *pooled = vio_as_pooled_vio(as_vio(completion));
	struct slab_journal *journal = completion->parent;
	sequence_number_t committed = get_committing_sequence_number(pooled);

	list_del_init(&pooled->list_entry);
	return_vio_to_pool(pooled);

	if (result != VDO_SUCCESS) {
		vio_record_metadata_io_error(as_vio(completion));
		vdo_log_error_strerror(result, "cannot write slab journal block %llu",
				       (unsigned long long) committed);
		vdo_enter_read_only_mode(journal->slab->allocator->depot->vdo, result);
		check_if_slab_drained(journal->slab);
		return;
	}

	WRITE_ONCE(journal->events->blocks_written, journal->events->blocks_written + 1);

	if (list_empty(&journal->uncommitted_blocks)) {
		/* No blocks outstanding: the commit point is the tail. */
		journal->next_commit = journal->tail;
	} else {
		/* Otherwise it is the oldest incomplete block. */
		pooled = container_of(journal->uncommitted_blocks.next,
				      struct pooled_vio, list_entry);
		journal->next_commit = get_committing_sequence_number(pooled);
	}

	update_tail_block_location(journal);
}
static void write_slab_journal_endio(struct bio *bio)
{
struct vio *vio = bio->bi_private;
struct slab_journal *journal = vio->completion.parent;
continue_vio_after_io(vio, complete_write, journal->slab->allocator->thread_id);
}
/*
 * Write the journal's tail block using a vio from the allocator's pool,
 * then open a fresh tail block. Unused entry slots release their share of
 * the block's lock, which marks a partial write in progress.
 */
static void write_slab_journal_block(struct vdo_waiter *waiter, void *context)
{
	struct pooled_vio *pooled = context;
	struct vio *vio = &pooled->vio;
	struct slab_journal *journal =
		container_of(waiter, struct slab_journal, resource_waiter);
	struct slab_journal_block_header *header = &journal->tail_header;
	int unused_entries = journal->entries_per_block - header->entry_count;
	physical_block_number_t block_number;
	const struct admin_state_code *operation;

	header->head = journal->head;
	list_add_tail(&pooled->list_entry, &journal->uncommitted_blocks);
	vdo_pack_slab_journal_block_header(header, &journal->block->header);

	/* Copy the tail block into the vio. */
	memcpy(pooled->vio.data, journal->block, VDO_BLOCK_SIZE);

	VDO_ASSERT_LOG_ONLY(unused_entries >= 0, "vdo_slab journal block is not overfull");
	if (unused_entries > 0) {
		/* Release the per-entry locks for any unused entry slots. */
		adjust_slab_journal_block_reference(journal, header->sequence_number,
						    -unused_entries);
		journal->partial_write_in_progress = !block_is_full(journal);
	}

	block_number = journal->slab->journal_origin +
		(header->sequence_number % journal->size);
	vio->completion.parent = journal;

	/* complete_write serves as both success and error handler here. */
	vdo_submit_metadata_vio(vdo_forget(vio), block_number, write_slab_journal_endio,
				complete_write, REQ_OP_WRITE);

	/* The write is submitted, so the tail block can advance. */
	journal->tail++;
	initialize_tail_block(journal);

	journal->waiting_to_commit = false;

	/* If a recovery replay was waiting for this commit, finish it. */
	operation = vdo_get_admin_state_code(&journal->slab->state);
	if (operation == VDO_ADMIN_STATE_WAITING_FOR_RECOVERY) {
		vdo_finish_operation(&journal->slab->state,
				     (vdo_is_read_only(journal->slab->allocator->depot->vdo) ?
				      VDO_READ_ONLY : VDO_SUCCESS));
		return;
	}

	add_entries(journal);
}
/*
 * Commit the journal's tail block, unless it is empty, a commit is already
 * pending, or the VDO is read-only.
 */
static void commit_tail(struct slab_journal *journal)
{
	if ((journal->tail_header.entry_count == 0) && must_make_entries_to_flush(journal)) {
		/*
		 * No entries yet, but there are entry waiters: defer the
		 * commit until their entries have been made.
		 */
		return;
	}

	if (vdo_is_read_only(journal->slab->allocator->depot->vdo) ||
	    journal->waiting_to_commit ||
	    (journal->tail_header.entry_count == 0)) {
		/* Nothing to do: empty, already committing, or read-only. */
		return;
	}

	/*
	 * The tail block is about to be committed, so this journal no longer
	 * needs its recovery lock or its place on the dirty list.
	 */
	mark_slab_journal_clean(journal);

	journal->waiting_to_commit = true;

	journal->resource_waiter.callback = write_slab_journal_block;
	acquire_vio_from_pool(journal->slab->allocator->vio_pool,
			      &journal->resource_waiter);
}
/*
 * Encode one entry into the tail block's payload. A block-map remapping
 * entry switches the block to the "full entry" format: the per-entry type
 * bitmap is zeroed on first use and this entry's bit is set.
 */
static void encode_slab_journal_entry(struct slab_journal_block_header *tail_header,
				      slab_journal_payload *payload,
				      slab_block_number sbn,
				      enum journal_operation operation,
				      bool increment)
{
	journal_entry_count_t entry_number = tail_header->entry_count++;

	if (operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING) {
		if (!tail_header->has_block_map_increments) {
			memset(payload->full_entries.entry_types, 0,
			       VDO_SLAB_JOURNAL_ENTRY_TYPES_SIZE);
			tail_header->has_block_map_increments = true;
		}

		/* One bit per entry, packed eight to a byte. */
		payload->full_entries.entry_types[entry_number / 8] |=
			((u8)1 << (entry_number % 8));
	}

	vdo_pack_slab_journal_entry(&payload->entries[entry_number], sbn, increment);
}
/*
 * Expand a recovery journal point into slab journal coordinates: each
 * recovery entry maps to two slots, with decrements taking the odd slot.
 */
static struct journal_point expand_journal_point(struct journal_point recovery_point,
						 bool increment)
{
	recovery_point.entry_count = (recovery_point.entry_count * 2) +
		(increment ? 0 : 1);
	return recovery_point;
}
/*
 * Append an entry to the tail block, entering read-only mode on any
 * invariant violation, and commit the block once it is full.
 */
static void add_entry(struct slab_journal *journal, physical_block_number_t pbn,
		      enum journal_operation operation, bool increment,
		      struct journal_point recovery_point)
{
	struct packed_slab_journal_block *block = journal->block;
	int result;

	/* Entries must arrive in strictly increasing recovery-journal order. */
	result = VDO_ASSERT(vdo_before_journal_point(&journal->tail_header.recovery_point,
						     &recovery_point),
			    "recovery journal point is monotonically increasing, recovery point: %llu.%u, block recovery point: %llu.%u",
			    (unsigned long long) recovery_point.sequence_number,
			    recovery_point.entry_count,
			    (unsigned long long) journal->tail_header.recovery_point.sequence_number,
			    journal->tail_header.recovery_point.entry_count);
	if (result != VDO_SUCCESS) {
		vdo_enter_read_only_mode(journal->slab->allocator->depot->vdo, result);
		return;
	}

	if (operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING) {
		/* Full-format entries have a smaller per-block capacity. */
		result = VDO_ASSERT((journal->tail_header.entry_count <
				     journal->full_entries_per_block),
				    "block has room for full entries");
		if (result != VDO_SUCCESS) {
			vdo_enter_read_only_mode(journal->slab->allocator->depot->vdo,
						 result);
			return;
		}
	}

	encode_slab_journal_entry(&journal->tail_header, &block->payload,
				  pbn - journal->slab->start, operation, increment);
	journal->tail_header.recovery_point = recovery_point;
	if (block_is_full(journal))
		commit_tail(journal);
}
/* Number of journal blocks between the head and the tail. */
static inline block_count_t journal_length(const struct slab_journal *journal)
{
	return journal->tail - journal->head;
}
/**
 * vdo_attempt_replay_into_slab() - Replay a recovery journal entry into a
 *                                  slab's journal.
 * @slab: The slab to replay into.
 * @pbn: The physical block the entry refers to.
 * @operation: The journal operation being replayed.
 * @increment: Whether the entry is an increment.
 * @recovery_point: The recovery journal point of the entry.
 * @parent: The completion to notify if the journal must first commit.
 *
 * Return: true if the entry was added or already covered; false if the
 *         journal is mid-commit and the caller must wait on @parent.
 */
bool vdo_attempt_replay_into_slab(struct vdo_slab *slab, physical_block_number_t pbn,
				  enum journal_operation operation, bool increment,
				  struct journal_point *recovery_point,
				  struct vdo_completion *parent)
{
	struct slab_journal *journal = &slab->journal;
	struct slab_journal_block_header *header = &journal->tail_header;
	struct journal_point expanded = expand_journal_point(*recovery_point, increment);

	/* Only accept entries after the tail block's recovery point. */
	if (!vdo_before_journal_point(&journal->tail_header.recovery_point, &expanded))
		return true;

	if ((header->entry_count >= journal->full_entries_per_block) &&
	    (header->has_block_map_increments || (operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING))) {
		/* The tail block has no room for this entry; commit it now. */
		commit_tail(journal);
	}

	if (journal->waiting_to_commit) {
		vdo_start_operation_with_waiter(&journal->slab->state,
						VDO_ADMIN_STATE_WAITING_FOR_RECOVERY,
						parent, NULL);
		return false;
	}

	if (journal_length(journal) >= journal->size) {
		/*
		 * NOTE(review): a full journal here has its head discarded
		 * without reaping — presumably stale entries from a journal
		 * that was previously too small; confirm.
		 */
		journal->head++;
		journal->unreapable++;
	}

	if (journal->slab->status == VDO_SLAB_REBUILT)
		journal->slab->status = VDO_SLAB_REPLAYING;

	add_entry(journal, pbn, operation, increment, expanded);
	return true;
}
/* The journal blocks new entries once it reaches its blocking threshold. */
static bool requires_reaping(const struct slab_journal *journal)
{
	return journal_length(journal) >= journal->blocking_threshold;
}
static void finish_summary_update(struct vdo_waiter *waiter, void *context)
{
struct vdo_slab *slab = container_of(waiter, struct vdo_slab, summary_waiter);
int result = *((int *) context);
slab->active_count--;
if ((result != VDO_SUCCESS) && (result != VDO_READ_ONLY)) {
vdo_log_error_strerror(result, "failed to update slab summary");
vdo_enter_read_only_mode(slab->allocator->depot->vdo, result);
}
check_if_slab_drained(slab);
}
static void write_reference_block(struct vdo_waiter *waiter, void *context);
/*
 * Waiter callback that starts writing a dirty reference block by borrowing
 * a vio from the allocator's pool. No-op in read-only mode.
 */
static void launch_reference_block_write(struct vdo_waiter *waiter, void *context)
{
	struct vdo_slab *slab = context;

	if (vdo_is_read_only(slab->allocator->depot->vdo))
		return;

	slab->active_count++;
	container_of(waiter, struct reference_block, waiter)->is_writing = true;
	waiter->callback = write_reference_block;
	acquire_vio_from_pool(slab->allocator->vio_pool, waiter);
}
/*
 * Launch a write for every dirty reference block, then check whether the
 * slab has finished draining.
 */
static void save_dirty_reference_blocks(struct vdo_slab *slab)
{
	vdo_waitq_notify_all_waiters(&slab->dirty_blocks,
				     launch_reference_block_write, slab);
	check_if_slab_drained(slab);
}
/*
 * Completion for a reference block write: release the slab journal lock
 * the write covered, re-queue the block if it was redirtied mid-write, and
 * once the whole slab is clean, mark it clean in the slab summary.
 */
static void finish_reference_block_write(struct vdo_completion *completion)
{
	struct vio *vio = as_vio(completion);
	struct pooled_vio *pooled = vio_as_pooled_vio(vio);
	struct reference_block *block = completion->parent;
	struct vdo_slab *slab = block->slab;
	tail_block_offset_t offset;

	slab->active_count--;

	/* Release the slab journal lock held for the just-written state. */
	adjust_slab_journal_block_reference(&slab->journal,
					    block->slab_journal_lock_to_release, -1);
	return_vio_to_pool(pooled);

	block->is_writing = false;

	if (vdo_is_read_only(completion->vdo)) {
		check_if_slab_drained(slab);
		return;
	}

	/* Re-queue the block if it was re-dirtied while it was writing. */
	if (block->is_dirty) {
		vdo_waitq_enqueue_waiter(&block->slab->dirty_blocks, &block->waiter);
		if (vdo_is_state_draining(&slab->state)) {
			/* While draining, the block must be relaunched now. */
			save_dirty_reference_blocks(slab);
		}

		return;
	}

	/* Only mark the slab clean once no writes or dirty blocks remain. */
	if ((slab->active_count > 0) || vdo_waitq_has_waiters(&slab->dirty_blocks)) {
		check_if_slab_drained(slab);
		return;
	}

	offset = slab->allocator->summary_entries[slab->slab_number].tail_block_offset;
	slab->active_count++;
	slab->summary_waiter.callback = finish_summary_update;
	update_slab_summary_entry(slab, &slab->summary_waiter, offset,
				  true, true, slab->free_blocks);
}
/* Locate a reference block's slice of the slab's counter array. */
static vdo_refcount_t * __must_check get_reference_counters_for_block(struct reference_block *block)
{
	size_t index = block - block->slab->reference_blocks;

	return &block->slab->counters[index * COUNTS_PER_BLOCK];
}
/*
 * Serialize a reference block into on-disk format, stamping every sector
 * with the slab's current journal commit point.
 */
static void pack_reference_block(struct reference_block *block, void *buffer)
{
	struct packed_reference_block *packed = buffer;
	vdo_refcount_t *counters = get_reference_counters_for_block(block);
	struct packed_journal_point commit_point;
	sector_count_t sector;

	vdo_pack_journal_point(&block->slab->slab_journal_point, &commit_point);

	for (sector = 0; sector < VDO_SECTORS_PER_BLOCK; sector++) {
		packed->sectors[sector].commit_point = commit_point;
		memcpy(packed->sectors[sector].counts,
		       counters + (sector * COUNTS_PER_SECTOR),
		       (sizeof(vdo_refcount_t) * COUNTS_PER_SECTOR));
	}
}
/* Bio endio for a reference block write: continue on the allocator's thread. */
static void write_reference_block_endio(struct bio *bio)
{
	struct vio *vio = bio->bi_private;
	struct reference_block *block = vio->completion.parent;

	continue_vio_after_io(vio, finish_reference_block_write,
			      block->slab->allocator->thread_id);
}
/*
 * Handle an I/O error on a reference block vio: record it, return the vio
 * to the pool, discount every block the vio was writing from the slab's
 * active count, and enter read-only mode.
 */
static void handle_io_error(struct vdo_completion *completion)
{
	int result = completion->result;
	struct vio *vio = as_vio(completion);
	struct vdo_slab *slab = ((struct reference_block *) completion->parent)->slab;

	vio_record_metadata_io_error(vio);
	return_vio_to_pool(vio_as_pooled_vio(vio));
	/* One unit of active_count per block the vio covered. */
	slab->active_count -= vio->io_size / VDO_BLOCK_SIZE;
	vdo_enter_read_only_mode(slab->allocator->depot->vdo, result);
	check_if_slab_drained(slab);
}
/*
 * Pack and submit a reference block write on a pooled vio. The block is
 * marked clean before submission; any update during the write re-dirties
 * it and finish_reference_block_write() will requeue it.
 */
static void write_reference_block(struct vdo_waiter *waiter, void *context)
{
	size_t block_offset;
	physical_block_number_t pbn;
	struct pooled_vio *pooled = context;
	struct vdo_completion *completion = &pooled->vio.completion;
	struct reference_block *block = container_of(waiter, struct reference_block,
						     waiter);

	pack_reference_block(block, pooled->vio.data);
	block_offset = (block - block->slab->reference_blocks);
	pbn = (block->slab->ref_counts_origin + block_offset);
	/* The lock released on write completion is the one held right now. */
	block->slab_journal_lock_to_release = block->slab_journal_lock;
	completion->parent = block;

	/*
	 * Mark the block clean now, since updates made after this moment are
	 * not included in the data being written.
	 */
	block->is_dirty = false;

	WRITE_ONCE(block->slab->allocator->ref_counts_statistics.blocks_written,
		   block->slab->allocator->ref_counts_statistics.blocks_written + 1);

	completion->callback_thread_id = ((struct block_allocator *) pooled->context)->thread_id;
	vdo_submit_metadata_vio(&pooled->vio, pbn, write_reference_block_endio,
				handle_io_error, REQ_OP_WRITE | REQ_PREFLUSH);
}
/*
 * Apply back-pressure when the slab journal passes its flushing threshold
 * by writing out dirty reference blocks; the closer the journal gets to
 * its flushing deadline, the more blocks are written per call.
 */
static void reclaim_journal_space(struct slab_journal *journal)
{
	block_count_t length = journal_length(journal);
	struct vdo_slab *slab = journal->slab;
	block_count_t write_count = vdo_waitq_num_waiters(&slab->dirty_blocks);
	block_count_t written;

	if ((length < journal->flushing_threshold) || (write_count == 0))
		return;

	WRITE_ONCE(journal->events->flush_count, journal->events->flush_count + 1);
	if (length < journal->flushing_deadline) {
		/* Fewer writes when further from the deadline; at least one. */
		write_count /= journal->flushing_deadline - length + 1;
		write_count = max_t(block_count_t, write_count, 1);
	}

	for (written = 0; written < write_count; written++) {
		vdo_waitq_notify_next_waiter(&slab->dirty_blocks,
					     launch_reference_block_write, slab);
	}
}
/* Classify a raw reference count into its symbolic status. */
static enum reference_status __must_check reference_count_to_status(vdo_refcount_t count)
{
	if (count == EMPTY_REFERENCE_COUNT)
		return RS_FREE;

	if (count == 1)
		return RS_SINGLE;

	if (count == PROVISIONAL_REFERENCE_COUNT)
		return RS_PROVISIONAL;

	return RS_SHARED;
}
/*
 * Mark a reference block dirty, queueing it for writing unless it is
 * already dirty or currently being written.
 */
static void dirty_block(struct reference_block *block)
{
	if (block->is_dirty)
		return;

	block->is_dirty = true;
	if (block->is_writing)
		return;

	vdo_waitq_enqueue_waiter(&block->slab->dirty_blocks, &block->waiter);
}
/* Find the reference block covering a given block index in the slab. */
static struct reference_block * __must_check get_reference_block(struct vdo_slab *slab,
								 slab_block_number index)
{
	size_t block_index = index / COUNTS_PER_BLOCK;

	return &slab->reference_blocks[block_index];
}
/*
 * Convert a physical block number into an index within the slab, returning
 * VDO_OUT_OF_RANGE if the PBN does not fall in the slab's data region.
 */
static int __must_check slab_block_number_from_pbn(struct vdo_slab *slab,
						   physical_block_number_t pbn,
						   slab_block_number *slab_block_number_ptr)
{
	u64 offset;

	if (pbn < slab->start)
		return VDO_OUT_OF_RANGE;

	offset = pbn - slab->start;
	if (offset >= slab->allocator->depot->slab_config.data_blocks)
		return VDO_OUT_OF_RANGE;

	*slab_block_number_ptr = offset;
	return VDO_SUCCESS;
}
/*
 * Look up the reference counter for a PBN within the slab; the counter
 * pointer is only set on success.
 */
static int __must_check get_reference_counter(struct vdo_slab *slab,
					      physical_block_number_t pbn,
					      vdo_refcount_t **counter_ptr)
{
	slab_block_number index;
	int result = slab_block_number_from_pbn(slab, pbn, &index);

	if (result == VDO_SUCCESS)
		*counter_ptr = &slab->counters[index];

	return result;
}
/*
 * Compute a slab's allocation priority: 0 when full, a fixed priority for
 * never-opened slabs, otherwise based on the log of the free-block count
 * (skipping over the unopened priority slot).
 */
static unsigned int calculate_slab_priority(struct vdo_slab *slab)
{
	block_count_t free_blocks = slab->free_blocks;
	unsigned int unopened_slab_priority = slab->allocator->unopened_slab_priority;
	unsigned int priority;

	if (free_blocks == 0)
		return 0;

	if (is_slab_journal_blank(slab))
		return unopened_slab_priority;

	priority = 1 + ilog2(free_blocks);
	if (priority < unopened_slab_priority)
		return priority;

	return priority + 1;
}
/*
 * Compute a slab's priority and enqueue it in the allocator's priority
 * table; the slab must not already be queued on any list.
 */
static void prioritize_slab(struct vdo_slab *slab)
{
	VDO_ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry),
			    "a slab must not already be on a list when prioritizing");
	slab->priority = calculate_slab_priority(slab);
	vdo_priority_table_enqueue(slab->allocator->prioritized_slabs,
				   slab->priority, &slab->allocq_entry);
}
/*
 * Adjust the allocator's allocated-block count after a slab's free count
 * changed, and requeue the slab in the priority table if its priority
 * changed. The open slab is skipped; it is reprioritized when closed.
 */
static void adjust_free_block_count(struct vdo_slab *slab, bool incremented)
{
	struct block_allocator *allocator = slab->allocator;

	/* incremented == a free block was gained == one fewer allocated. */
	WRITE_ONCE(allocator->allocated_blocks,
		   allocator->allocated_blocks + (incremented ? -1 : 1));
	if (slab == allocator->open_slab)
		return;

	/* Skip the priority table churn when the priority is unchanged. */
	if (slab->priority == calculate_slab_priority(slab))
		return;

	vdo_priority_table_remove(allocator->prioritized_slabs, &slab->allocq_entry);
	prioritize_slab(slab);
}
/*
 * Apply an increment to a data block's reference count. A free block
 * becomes singly referenced; a provisional reference is converted to a
 * real one; otherwise the count is incremented up to
 * MAXIMUM_REFERENCE_COUNT.
 */
static int increment_for_data(struct vdo_slab *slab, struct reference_block *block,
			      slab_block_number block_number,
			      enum reference_status old_status,
			      struct pbn_lock *lock, vdo_refcount_t *counter_ptr,
			      bool adjust_block_count)
{
	switch (old_status) {
	case RS_FREE:
		*counter_ptr = 1;
		block->allocated_count++;
		slab->free_blocks--;
		if (adjust_block_count)
			adjust_free_block_count(slab, false);
		break;
	case RS_PROVISIONAL:
		/* Already counted as allocated; just make the reference real. */
		*counter_ptr = 1;
		break;
	default:
		/* RS_SINGLE or RS_SHARED. */
		if (*counter_ptr >= MAXIMUM_REFERENCE_COUNT) {
			return vdo_log_error_strerror(VDO_REF_COUNT_INVALID,
						      "Incrementing a block already having 254 references (slab %u, offset %u)",
						      slab->slab_number, block_number);
		}
		(*counter_ptr)++;
	}

	if (lock != NULL)
		vdo_unassign_pbn_lock_provisional_reference(lock);
	return VDO_SUCCESS;
}
/*
 * Apply a decrement to a data block's reference count. Releasing the last
 * (or a provisional) reference while the zone still holds a PBN lock on
 * the block leaves the count provisional instead of free, so the lock
 * holder retains the block; otherwise the block is freed.
 */
static int decrement_for_data(struct vdo_slab *slab, struct reference_block *block,
			      slab_block_number block_number,
			      enum reference_status old_status,
			      struct reference_updater *updater,
			      vdo_refcount_t *counter_ptr, bool adjust_block_count)
{
	switch (old_status) {
	case RS_FREE:
		return vdo_log_error_strerror(VDO_REF_COUNT_INVALID,
					      "Decrementing free block at offset %u in slab %u",
					      block_number, slab->slab_number);
	case RS_PROVISIONAL:
	case RS_SINGLE:
		if (updater->zpbn.zone != NULL) {
			struct pbn_lock *lock = vdo_get_physical_zone_pbn_lock(updater->zpbn.zone,
									       updater->zpbn.pbn);
			if (lock != NULL) {
				/*
				 * A PBN lock is held on this block, so it
				 * must not become unreferenced.
				 */
				*counter_ptr = PROVISIONAL_REFERENCE_COUNT;
				vdo_assign_pbn_lock_provisional_reference(lock);
				break;
			}
		}
		*counter_ptr = EMPTY_REFERENCE_COUNT;
		block->allocated_count--;
		slab->free_blocks++;
		if (adjust_block_count)
			adjust_free_block_count(slab, true);
		break;
	default:
		/* RS_SHARED. */
		(*counter_ptr)--;
	}
	return VDO_SUCCESS;
}
/*
 * Apply an increment to a block map block. Block map blocks are pinned at
 * MAXIMUM_REFERENCE_COUNT: a free block may only be claimed outside normal
 * operation (rebuild/replay), a provisional reference may only be
 * converted during normal operation, and any other state is an error.
 */
static int increment_for_block_map(struct vdo_slab *slab, struct reference_block *block,
				   slab_block_number block_number,
				   enum reference_status old_status,
				   struct pbn_lock *lock, bool normal_operation,
				   vdo_refcount_t *counter_ptr, bool adjust_block_count)
{
	switch (old_status) {
	case RS_FREE:
		if (normal_operation) {
			return vdo_log_error_strerror(VDO_REF_COUNT_INVALID,
						      "Incrementing unallocated block map block (slab %u, offset %u)",
						      slab->slab_number, block_number);
		}
		*counter_ptr = MAXIMUM_REFERENCE_COUNT;
		block->allocated_count++;
		slab->free_blocks--;
		if (adjust_block_count)
			adjust_free_block_count(slab, false);
		return VDO_SUCCESS;
	case RS_PROVISIONAL:
		if (!normal_operation)
			return vdo_log_error_strerror(VDO_REF_COUNT_INVALID,
						      "Block map block had provisional reference during replay (slab %u, offset %u)",
						      slab->slab_number, block_number);
		*counter_ptr = MAXIMUM_REFERENCE_COUNT;
		if (lock != NULL)
			vdo_unassign_pbn_lock_provisional_reference(lock);
		return VDO_SUCCESS;
	default:
		return vdo_log_error_strerror(VDO_REF_COUNT_INVALID,
					      "Incrementing a block map block which is already referenced %u times (slab %u, offset %u)",
					      *counter_ptr, slab->slab_number,
					      block_number);
	}
}
/* A journal point is valid only if present with a non-zero sequence number. */
static bool __must_check is_valid_journal_point(const struct journal_point *point)
{
	if (point == NULL)
		return false;

	return point->sequence_number > 0;
}
/*
 * Dispatch a reference count update to the appropriate handler based on
 * whether it is an increment or decrement, and whether the block holds
 * data or block map pages. On success, the covering slab journal point is
 * recorded; a released provisional reference is also flagged to the
 * caller via *provisional_decrement_ptr.
 */
static int update_reference_count(struct vdo_slab *slab, struct reference_block *block,
				  slab_block_number block_number,
				  const struct journal_point *slab_journal_point,
				  struct reference_updater *updater,
				  bool normal_operation, bool adjust_block_count,
				  bool *provisional_decrement_ptr)
{
	vdo_refcount_t *counter_ptr = &slab->counters[block_number];
	enum reference_status old_status = reference_count_to_status(*counter_ptr);
	int result;

	if (!updater->increment) {
		result = decrement_for_data(slab, block, block_number, old_status,
					    updater, counter_ptr, adjust_block_count);
		if ((result == VDO_SUCCESS) && (old_status == RS_PROVISIONAL)) {
			/*
			 * Signal the provisional release without recording
			 * the journal point.
			 */
			if (provisional_decrement_ptr != NULL)
				*provisional_decrement_ptr = true;
			return VDO_SUCCESS;
		}
	} else if (updater->operation == VDO_JOURNAL_DATA_REMAPPING) {
		result = increment_for_data(slab, block, block_number, old_status,
					    updater->lock, counter_ptr, adjust_block_count);
	} else {
		result = increment_for_block_map(slab, block, block_number, old_status,
						 updater->lock, normal_operation,
						 counter_ptr, adjust_block_count);
	}

	if (result != VDO_SUCCESS)
		return result;

	if (is_valid_journal_point(slab_journal_point))
		slab->slab_journal_point = *slab_journal_point;

	return VDO_SUCCESS;
}
/*
 * adjust_reference_count() - Adjust the reference count of a block during normal operation.
 * @slab: The slab which owns the block.
 * @updater: The reference updater describing the change.
 * @slab_journal_point: The journal point at which this update was journaled.
 *
 * Return: VDO_SUCCESS or an error code.
 */
static int __must_check adjust_reference_count(struct vdo_slab *slab,
					       struct reference_updater *updater,
					       const struct journal_point *slab_journal_point)
{
	slab_block_number block_number;
	int result;
	struct reference_block *block;
	bool provisional_decrement = false;

	if (!is_slab_open(slab))
		return VDO_INVALID_ADMIN_STATE;

	result = slab_block_number_from_pbn(slab, updater->zpbn.pbn, &block_number);
	if (result != VDO_SUCCESS)
		return result;

	block = get_reference_block(slab, block_number);
	result = update_reference_count(slab, block, block_number, slab_journal_point,
					updater, NORMAL_OPERATION, true,
					&provisional_decrement);
	if ((result != VDO_SUCCESS) || provisional_decrement)
		return result;

	if (block->is_dirty && (block->slab_journal_lock > 0)) {
		/*
		 * This block is already dirty and a slab journal entry has been made for it
		 * since the last time it was clean, so release the per-entry slab journal
		 * lock for the entry associated with this update.
		 *
		 * FIX: validate the journal point BEFORE dereferencing it. The original code
		 * read sequence_number first, which would dereference a NULL point before the
		 * assertion below could catch it.
		 */
		result = VDO_ASSERT(is_valid_journal_point(slab_journal_point),
				    "Reference count adjustments need slab journal points.");
		if (result != VDO_SUCCESS)
			return result;

		adjust_slab_journal_block_reference(&slab->journal,
						    slab_journal_point->sequence_number, -1);
		return VDO_SUCCESS;
	}

	/*
	 * This is the first entry for this block since it was last clean; remember the
	 * journal point so the lock can be released when the block is next written out.
	 */
	if (is_valid_journal_point(slab_journal_point))
		block->slab_journal_lock = slab_journal_point->sequence_number;
	else
		block->slab_journal_lock = 0;

	dirty_block(block);
	return VDO_SUCCESS;
}
/*
 * add_entry_from_waiter() - Add a slab journal entry on behalf of a waiting reference updater.
 * @waiter: The updater's waiter.
 * @context: The slab journal to add the entry to.
 *
 * Implements vdo_waiter_callback_fn (invoked via vdo_waitq_notify_next_waiter()).
 */
static void add_entry_from_waiter(struct vdo_waiter *waiter, void *context)
{
	int result;
	struct reference_updater *updater =
		container_of(waiter, struct reference_updater, waiter);
	struct data_vio *data_vio = data_vio_from_reference_updater(updater);
	struct slab_journal *journal = context;
	struct slab_journal_block_header *header = &journal->tail_header;
	/* The journal location where this entry will land. */
	struct journal_point slab_journal_point = {
		.sequence_number = header->sequence_number,
		.entry_count = header->entry_count,
	};
	sequence_number_t recovery_block = data_vio->recovery_journal_point.sequence_number;

	if (header->entry_count == 0) {
		/*
		 * This is the first entry in the current tail block, so record the recovery
		 * journal block it depends on and take a reference on that block, held until
		 * this tail block is committed.
		 */
		get_lock(journal, header->sequence_number)->recovery_start = recovery_block;
		if (journal->recovery_journal != NULL) {
			zone_count_t zone_number = journal->slab->allocator->zone_number;
			vdo_acquire_recovery_journal_block_reference(journal->recovery_journal,
								     recovery_block,
								     VDO_ZONE_TYPE_PHYSICAL,
								     zone_number);
		}
		mark_slab_journal_dirty(journal, recovery_block);
		reclaim_journal_space(journal);
	}

	add_entry(journal, updater->zpbn.pbn, updater->operation, updater->increment,
		  expand_journal_point(data_vio->recovery_journal_point,
				       updater->increment));

	if (journal->slab->status != VDO_SLAB_REBUILT) {
		/*
		 * The slab is not fully rebuilt: the counters are not updated here, so the
		 * per-entry slab journal lock taken by add_entry() is released immediately.
		 */
		adjust_slab_journal_block_reference(journal,
						    slab_journal_point.sequence_number, -1);
		result = VDO_SUCCESS;
	} else {
		/* Now that the entry is in the journal, update the reference counts. */
		result = adjust_reference_count(journal->slab, updater,
						&slab_journal_point);
	}

	/* Increments complete the data_vio itself; decrements complete a sub-completion. */
	if (updater->increment)
		continue_data_vio_with_error(data_vio, result);
	else
		vdo_continue_completion(&data_vio->decrement_completion, result);
}
/* Check whether the first queued entry waiter is performing a block map remapping. */
static inline bool is_next_entry_a_block_map_increment(struct slab_journal *journal)
{
	struct reference_updater *updater =
		container_of(vdo_waitq_get_first_waiter(&journal->entry_waiters),
			     struct reference_updater, waiter);

	return (updater->operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING);
}
/*
 * add_entries() - Add as many entries as possible from the queue of vios waiting to make
 *                 entries.
 * @journal: The journal whose waiters should be serviced.
 *
 * By processing the queue in order, slab journal entries are made in the same order as the
 * corresponding recovery journal entries.
 */
static void add_entries(struct slab_journal *journal)
{
	if (journal->adding_entries) {
		/* Protect against re-entrancy. */
		return;
	}
	journal->adding_entries = true;
	while (vdo_waitq_has_waiters(&journal->entry_waiters)) {
		struct slab_journal_block_header *header = &journal->tail_header;
		/* No entries may be added during a partial write or while rebuilding. */
		if (journal->partial_write_in_progress ||
		    (journal->slab->status == VDO_SLAB_REBUILDING)) {
			break;
		}
		if (journal->waiting_to_commit) {
			/* The tail block is waiting for resources to commit; try again later. */
			WRITE_ONCE(journal->events->tail_busy_count,
				   journal->events->tail_busy_count + 1);
			break;
		} else if (is_next_entry_a_block_map_increment(journal) &&
			   (header->entry_count >= journal->full_entries_per_block)) {
			/*
			 * The tail block has no room for a block map increment (those need a
			 * full-size entry), so commit it now.
			 */
			commit_tail(journal);
			if (journal->waiting_to_commit) {
				WRITE_ONCE(journal->events->tail_busy_count,
					   journal->events->tail_busy_count + 1);
				break;
			}
		}
		/* If the journal needs reaping, push reference blocks out to free space. */
		if (requires_reaping(journal)) {
			WRITE_ONCE(journal->events->blocked_count,
				   journal->events->blocked_count + 1);
			save_dirty_reference_blocks(journal->slab);
			break;
		}
		if (header->entry_count == 0) {
			struct journal_lock *lock =
				get_lock(journal, header->sequence_number);
			/*
			 * The lock for this (new) block should only be held if the on-disk
			 * journal is completely full; otherwise something is wrong.
			 */
			if (lock->count > 0) {
				VDO_ASSERT_LOG_ONLY((journal->head + journal->size) == journal->tail,
						    "New block has locks, but journal is not full");
				VDO_ASSERT_LOG_ONLY((journal->blocking_threshold >= journal->size),
						    "New block can have locks already iff blocking threshold is at the end of the journal");
				WRITE_ONCE(journal->events->disk_full_count,
					   journal->events->disk_full_count + 1);
				save_dirty_reference_blocks(journal->slab);
				break;
			}
			/*
			 * Hold one lock per potential entry plus one for the commit, so the
			 * block cannot be reaped until fully committed.
			 */
			lock->count = journal->entries_per_block + 1;
			if (header->sequence_number == 1) {
				struct vdo_slab *slab = journal->slab;
				block_count_t i;
				/*
				 * This is the first entry ever in this slab journal. Dirty all
				 * reference blocks and have each hold a lock on block 1 so the
				 * journal cannot be reaped before the counts are written.
				 */
				for (i = 0; i < slab->reference_block_count; i++) {
					slab->reference_blocks[i].slab_journal_lock = 1;
					dirty_block(&slab->reference_blocks[i]);
				}
				adjust_slab_journal_block_reference(journal, 1,
								    slab->reference_block_count);
			}
		}
		vdo_waitq_notify_next_waiter(&journal->entry_waiters,
					     add_entry_from_waiter, journal);
	}
	journal->adding_entries = false;
	/* If draining (but not suspending) and nothing is left to add, commit the tail. */
	if (vdo_is_state_draining(&journal->slab->state) &&
	    !vdo_is_state_suspending(&journal->slab->state) &&
	    !vdo_waitq_has_waiters(&journal->entry_waiters))
		commit_tail(journal);
}
/* Rewind the slab's free-block search cursor to the first counter of the first block. */
static void reset_search_cursor(struct vdo_slab *slab)
{
	struct search_cursor *sc = &slab->search_cursor;

	sc->index = 0;
	sc->block = sc->first_block;
	/* The first reference block may be short if the slab is small. */
	sc->end_index = min_t(u32, COUNTS_PER_BLOCK, slab->block_count);
}
/*
 * advance_search_cursor() - Move the search cursor to the next reference block.
 *
 * Wraps around (and resets) after the last block. Return: true unless the cursor wrapped.
 */
static bool advance_search_cursor(struct vdo_slab *slab)
{
	struct search_cursor *sc = &slab->search_cursor;

	/* Wrap back to the start once the final block has been searched. */
	if (sc->block == sc->last_block) {
		reset_search_cursor(slab);
		return false;
	}

	sc->block++;
	sc->index = sc->end_index;
	/* The last block may be short; every other block holds COUNTS_PER_BLOCK counters. */
	sc->end_index = ((sc->block == sc->last_block) ?
			 slab->block_count : sc->end_index + COUNTS_PER_BLOCK);
	return true;
}
/*
 * vdo_adjust_reference_count_for_rebuild() - Replay an increment for a block during rebuild.
 * @depot: The depot containing the block.
 * @pbn: The physical block number to adjust.
 * @operation: The journal operation being replayed.
 *
 * Return: VDO_SUCCESS or an error.
 */
int vdo_adjust_reference_count_for_rebuild(struct slab_depot *depot,
					   physical_block_number_t pbn,
					   enum journal_operation operation)
{
	int result;
	slab_block_number sbn;
	struct reference_block *ref_block;
	struct vdo_slab *slab = vdo_get_slab(depot, pbn);
	struct reference_updater updater = {
		.operation = operation,
		.increment = true,
	};

	result = slab_block_number_from_pbn(slab, pbn, &sbn);
	if (result != VDO_SUCCESS)
		return result;

	ref_block = get_reference_block(slab, sbn);
	/* Rebuild updates record no journal point and don't touch the free count. */
	result = update_reference_count(slab, ref_block, sbn, NULL,
					&updater, !NORMAL_OPERATION, false, NULL);
	if (result != VDO_SUCCESS)
		return result;

	dirty_block(ref_block);
	return VDO_SUCCESS;
}
/*
 * replay_reference_count_change() - Replay one slab journal entry into the counters.
 * @slab: The slab being replayed into.
 * @entry_point: The journal point of the entry.
 * @entry: The decoded entry.
 *
 * Return: VDO_SUCCESS or an error.
 */
static int replay_reference_count_change(struct vdo_slab *slab,
					 const struct journal_point *entry_point,
					 struct slab_journal_entry entry)
{
	int result;
	struct reference_block *block = get_reference_block(slab, entry.sbn);
	sector_count_t sector = (entry.sbn % COUNTS_PER_BLOCK) / COUNTS_PER_SECTOR;
	struct reference_updater updater = {
		.operation = entry.operation,
		.increment = entry.increment,
	};

	/* The sector was committed at or after this entry; it is already applied. */
	if (!vdo_before_journal_point(&block->commit_points[sector], entry_point))
		return VDO_SUCCESS;

	result = update_reference_count(slab, block, entry.sbn, entry_point,
					&updater, !NORMAL_OPERATION, false, NULL);
	if (result != VDO_SUCCESS)
		return result;

	dirty_block(block);
	return VDO_SUCCESS;
}
/*
 * find_zero_byte_in_word() - Find the index of the first zero byte in a little-endian word.
 * @word_ptr: Pointer to the (possibly unaligned) 8-byte word.
 * @start_index: The counter index corresponding to the word's first byte.
 * @fail_index: The value to return when no byte is zero.
 *
 * Return: start_index plus the offset of the first zero byte, or fail_index.
 */
static inline slab_block_number find_zero_byte_in_word(const u8 *word_ptr,
						       slab_block_number start_index,
						       slab_block_number fail_index)
{
	unsigned int byte;
	u64 word = get_unaligned_le64(word_ptr);

	/* Examine the bytes least-significant first; each shift exposes the next one. */
	for (byte = 0; byte < BYTES_PER_WORD; byte++, word >>= 8) {
		if ((word & 0xFF) == 0)
			return start_index + byte;
	}

	return fail_index;
}
/*
 * find_free_block() - Search the current cursor block for a counter with value zero.
 * @slab: The slab to search.
 * @index_ptr: Receives the index of a free counter when one is found.
 *
 * Return: true if a free counter was found before the cursor's end index.
 */
static bool find_free_block(const struct vdo_slab *slab, slab_block_number *index_ptr)
{
	slab_block_number candidate;
	slab_block_number word_index = slab->search_cursor.index;
	slab_block_number end_index = slab->search_cursor.end_index;
	u8 *cursor = &slab->counters[word_index];
	u8 *end = &slab->counters[end_index];

	/*
	 * Scan a word at a time. The counters array carries extra padding (see
	 * allocate_slab_counters()) so reading a whole word at the tail is safe; any hit
	 * at or past end_index is rejected by the comparison below.
	 */
	do {
		candidate = find_zero_byte_in_word(cursor, word_index, end_index);
		if (candidate < end_index) {
			*index_ptr = candidate;
			return true;
		}

		word_index += BYTES_PER_WORD;
		cursor += BYTES_PER_WORD;
	} while (cursor < end);

	return false;
}
/* Search the cursor's current reference block, skipping it when it is fully allocated. */
static bool search_current_reference_block(const struct vdo_slab *slab,
					   slab_block_number *free_index_ptr)
{
	if (slab->search_cursor.block->allocated_count >= COUNTS_PER_BLOCK)
		return false;

	return find_free_block(slab, free_index_ptr);
}
/*
 * search_reference_blocks() - Search each reference block from the cursor onward for a
 *                             free counter, advancing the cursor as blocks are exhausted.
 *
 * Return: true if a free counter was found.
 */
static bool search_reference_blocks(struct vdo_slab *slab,
				    slab_block_number *free_index_ptr)
{
	do {
		if (search_current_reference_block(slab, free_index_ptr))
			return true;
	} while (advance_search_cursor(slab));

	return false;
}
/* Transition an unreferenced counter to a provisional reference and account for it. */
static void make_provisional_reference(struct vdo_slab *slab,
				       slab_block_number block_number)
{
	struct reference_block *owner = get_reference_block(slab, block_number);

	slab->counters[block_number] = PROVISIONAL_REFERENCE_COUNT;

	/* The block is now allocated from the slab's point of view. */
	owner->allocated_count++;
	slab->free_blocks--;
}
/* Mark every reference block of the slab dirty so all will be written out. */
static void dirty_all_reference_blocks(struct vdo_slab *slab)
{
	block_count_t block;

	for (block = 0; block < slab->reference_block_count; block++)
		dirty_block(&slab->reference_blocks[block]);
}
static inline bool journal_points_equal(struct journal_point first,
struct journal_point second)
{
return ((first.sequence_number == second.sequence_number) &&
(first.entry_count == second.entry_count));
}
/*
 * match_bytes() - SWAR byte match: produce 0x01 in every byte lane of the result where
 *                 the corresponding byte of @input equals @match, and 0x00 elsewhere.
 */
static inline u64 match_bytes(u64 input, u8 match)
{
	/* XOR leaves a zero byte exactly in the lanes that matched. */
	u64 delta = input ^ (match * 0x0101010101010101ULL);
	/* High bit clear in the lane... */
	u64 top = ~delta & 0x8080808080808080ULL;
	/* ...and the low seven bits borrow through only when the lane was zero. */
	u64 low = 0x8080808080808080ULL - (delta & 0x7f7f7f7f7f7f7f7fULL);

	return (top & low) >> 7;
}
/*
 * count_valid_references() - Count the non-empty counters in one reference block's worth of
 *                            counters, converting provisional counts to empty in place.
 * @counters: The COUNTS_PER_BLOCK single-byte counters (modified in place).
 *
 * Return: The number of counters holding a real reference.
 */
static unsigned int count_valid_references(vdo_refcount_t *counters)
{
	u64 *words = (u64 *)counters;
	unsigned int empty_count = 0;
	unsigned int words_left = COUNTS_PER_BLOCK / sizeof(u64);

	/* The word-at-a-time tricks below require one-byte counters and whole words. */
	BUILD_BUG_ON(sizeof(vdo_refcount_t) != 1);
	BUILD_BUG_ON((COUNTS_PER_BLOCK % sizeof(u64)) != 0);

	while (words_left > 0) {
		/*
		 * match_bytes() yields 0x01 per matching byte lane; split_count accumulates
		 * those per-lane tallies. Processing at most 254 / sizeof(u64) words per pass
		 * keeps every lane total below 0xff so lanes never carry into each other, and
		 * "split_count % 255" (a u64 mod 255 equals the sum of its base-256 digits
		 * mod 255) then recovers the exact number of matched bytes.
		 */
		u64 split_count = 0;
		const unsigned int max_words_per_iteration = 254 / sizeof(u64);
		unsigned int iter_words_left = min_t(unsigned int, words_left,
						     max_words_per_iteration);
		words_left -= iter_words_left;
		while (iter_words_left--) {
			u64 word = *words;
			u64 temp;
			/* Rewrite any provisional counts to empty. */
			temp = match_bytes(word, PROVISIONAL_REFERENCE_COUNT);
			if (temp) {
				/*
				 * temp has 0x01 in each provisional lane; multiplying by the
				 * XOR of the two values builds a mask that flips exactly those
				 * bytes from PROVISIONAL to EMPTY.
				 */
				word ^= temp * (PROVISIONAL_REFERENCE_COUNT ^ EMPTY_REFERENCE_COUNT);
				*words = word;
			}
			/* Tally the empty bytes in this word. */
			split_count += match_bytes(word, EMPTY_REFERENCE_COUNT);
			words++;
		}
		empty_count += split_count % 255;
	}
	return COUNTS_PER_BLOCK - empty_count;
}
/*
 * unpack_reference_block() - Unpack an on-disk reference block into its in-memory form.
 * @packed: The on-disk representation.
 * @block: The reference block to fill in.
 *
 * Also advances the slab's journal point past any sector commit points, and logs a warning
 * for torn writes (sectors whose commit points disagree).
 */
static void unpack_reference_block(struct packed_reference_block *packed,
				   struct reference_block *block)
{
	sector_count_t i;
	struct vdo_slab *slab = block->slab;
	vdo_refcount_t *counters = get_reference_counters_for_block(block);

	for (i = 0; i < VDO_SECTORS_PER_BLOCK; i++) {
		struct packed_reference_sector *sector = &packed->sectors[i];

		/*
		 * FIX: this call previously read "(§or->commit_point" — a mangled
		 * "&sector" (encoding corruption) which does not compile.
		 */
		vdo_unpack_journal_point(&sector->commit_point, &block->commit_points[i]);
		memcpy(counters + (i * COUNTS_PER_SECTOR), sector->counts,
		       (sizeof(vdo_refcount_t) * COUNTS_PER_SECTOR));

		/* The slab journal point must be at least as far along as every sector. */
		if (vdo_before_journal_point(&slab->slab_journal_point,
					     &block->commit_points[i]))
			slab->slab_journal_point = block->commit_points[i];

		/* Mismatched sector commit points indicate a torn write. */
		if ((i > 0) &&
		    !journal_points_equal(block->commit_points[0],
					  block->commit_points[i])) {
			size_t block_index = block - block->slab->reference_blocks;

			vdo_log_warning("Torn write detected in sector %u of reference block %zu of slab %u",
					i, block_index, block->slab->slab_number);
		}
	}

	block->allocated_count = count_valid_references(counters);
}
/*
 * finish_reference_block_load() - Process a group of freshly read reference blocks.
 * @completion: The vio which carried the read.
 *
 * Unpacks each block in the vio's buffer, returns the vio to its pool, and checks whether
 * the slab has finished draining.
 */
static void finish_reference_block_load(struct vdo_completion *completion)
{
	struct vio *vio = as_vio(completion);
	struct pooled_vio *pooled = vio_as_pooled_vio(vio);
	struct reference_block *block = completion->parent;
	struct vdo_slab *slab = block->slab;
	unsigned int blocks_read = vio->io_size / VDO_BLOCK_SIZE;
	unsigned int i;
	char *data = vio->data;

	for (i = 0; i < blocks_read; i++) {
		unpack_reference_block((struct packed_reference_block *) data, block);
		/* Deduct the references this block holds from the slab's free count. */
		slab->free_blocks -= block->allocated_count;
		block++;
		data += VDO_BLOCK_SIZE;
	}

	return_vio_to_pool(pooled);
	slab->active_count -= blocks_read;
	check_if_slab_drained(slab);
}
/* Bio completion for a reference block read; resume on the slab's allocator thread. */
static void load_reference_block_endio(struct bio *bio)
{
	struct vio *vio = bio->bi_private;
	struct reference_block *first_block = vio->completion.parent;

	continue_vio_after_io(vio, finish_reference_block_load,
			      first_block->slab->allocator->thread_id);
}
static void load_reference_block_group(struct vdo_waiter *waiter, void *context)
{
struct pooled_vio *pooled = context;
struct vio *vio = &pooled->vio;
struct reference_block *block =
container_of(waiter, struct reference_block, waiter);
u32 block_offset = block - block->slab->reference_blocks;
u32 max_block_count = block->slab->reference_block_count - block_offset;
u32 block_count = min_t(int, vio->block_count, max_block_count);
vio->completion.parent = block;
vdo_submit_metadata_vio_with_size(vio, block->slab->ref_counts_origin + block_offset,
load_reference_block_endio, handle_io_error,
REQ_OP_READ, block_count * VDO_BLOCK_SIZE);
}
static void load_reference_blocks(struct vdo_slab *slab)
{
block_count_t i;
u64 blocks_per_vio = slab->allocator->refcount_blocks_per_big_vio;
struct vio_pool *pool = slab->allocator->refcount_big_vio_pool;
if (!pool) {
pool = slab->allocator->vio_pool;
blocks_per_vio = 1;
}
slab->free_blocks = slab->block_count;
slab->active_count = slab->reference_block_count;
for (i = 0; i < slab->reference_block_count; i += blocks_per_vio) {
struct vdo_waiter *waiter = &slab->reference_blocks[i].waiter;
waiter->callback = load_reference_block_group;
acquire_vio_from_pool(pool, waiter);
}
}
/*
 * drain_slab() - Begin draining a slab, committing its journal tail and saving or loading
 *                reference counts as the specific drain operation requires.
 * @slab: The slab to drain.
 */
static void drain_slab(struct vdo_slab *slab)
{
	bool save;
	bool load;
	const struct admin_state_code *state = vdo_get_admin_state_code(&slab->state);

	/* Nothing is committed, saved, or loaded while merely suspending. */
	if (state == VDO_ADMIN_STATE_SUSPENDING)
		return;

	if ((state != VDO_ADMIN_STATE_REBUILDING) &&
	    (state != VDO_ADMIN_STATE_SAVE_FOR_SCRUBBING))
		commit_tail(&slab->journal);

	/* No counters yet (or recovering): there is nothing further to save or load. */
	if ((state == VDO_ADMIN_STATE_RECOVERING) || (slab->counters == NULL))
		return;

	save = false;
	load = slab->allocator->summary_entries[slab->slab_number].load_ref_counts;
	if (state == VDO_ADMIN_STATE_SCRUBBING) {
		if (load) {
			load_reference_blocks(slab);
			return;
		}
	} else if (state == VDO_ADMIN_STATE_SAVE_FOR_SCRUBBING) {
		if (!load) {
			/* These counters were never written out, so mark them all dirty. */
			dirty_all_reference_blocks(slab);
		}
		save = true;
	} else if (state == VDO_ADMIN_STATE_REBUILDING) {
		/*
		 * Write the counters out if they were loaded before, if any block is
		 * referenced, or if the slab journal is not blank.
		 */
		block_count_t data_blocks = slab->allocator->depot->slab_config.data_blocks;

		if (load || (slab->free_blocks != data_blocks) ||
		    !is_slab_journal_blank(slab)) {
			dirty_all_reference_blocks(slab);
			save = true;
		}
	} else if (state == VDO_ADMIN_STATE_SAVING) {
		/* Only fully rebuilt slabs have trustworthy counters to save. */
		save = (slab->status == VDO_SLAB_REBUILT);
	} else {
		vdo_finish_draining_with_result(&slab->state, VDO_SUCCESS);
		return;
	}

	if (save)
		save_dirty_reference_blocks(slab);
}
/*
 * allocate_slab_counters() - Allocate a slab's reference blocks and counter array.
 * @slab: The slab to equip with counters.
 *
 * Return: VDO_SUCCESS or an allocation error (the reference blocks are freed on failure
 * of the counter allocation).
 */
static int allocate_slab_counters(struct vdo_slab *slab)
{
	int result;
	size_t bytes;
	size_t i;

	result = VDO_ASSERT(slab->reference_blocks == NULL,
			    "vdo_slab %u doesn't allocate refcounts twice",
			    slab->slab_number);
	if (result != VDO_SUCCESS)
		return result;

	result = vdo_allocate(slab->reference_block_count, struct reference_block,
			      __func__, &slab->reference_blocks);
	if (result != VDO_SUCCESS)
		return result;

	/* Two extra words of padding let the word-at-a-time search read past the end. */
	bytes = (slab->reference_block_count * COUNTS_PER_BLOCK) + (2 * BYTES_PER_WORD);
	result = vdo_allocate(bytes, vdo_refcount_t, "ref counts array",
			      &slab->counters);
	if (result != VDO_SUCCESS) {
		vdo_free(vdo_forget(slab->reference_blocks));
		return result;
	}

	for (i = 0; i < slab->reference_block_count; i++) {
		slab->reference_blocks[i] = (struct reference_block) {
			.slab = slab,
		};
	}

	slab->search_cursor.first_block = slab->reference_blocks;
	slab->search_cursor.last_block =
		&slab->reference_blocks[slab->reference_block_count - 1];
	reset_search_cursor(slab);

	return VDO_SUCCESS;
}
/* Allocate counters now only when the slab is being loaded cleanly. */
static int allocate_counters_if_clean(struct vdo_slab *slab)
{
	return (vdo_is_state_clean_load(&slab->state) ?
		allocate_slab_counters(slab) : VDO_SUCCESS);
}
/*
 * finish_loading_journal() - Decode the slab journal tail block just read from disk and
 *                            set up the journal's in-memory state from it.
 * @completion: The vio carrying the block data (its parent is the slab journal).
 */
static void finish_loading_journal(struct vdo_completion *completion)
{
	struct vio *vio = as_vio(completion);
	struct slab_journal *journal = completion->parent;
	struct vdo_slab *slab = journal->slab;
	struct packed_slab_journal_block *block = (struct packed_slab_journal_block *) vio->data;
	struct slab_journal_block_header header;

	vdo_unpack_slab_journal_block_header(&block->header, &header);

	/* Only use the block if it is a slab journal block from this VDO (nonce matches). */
	if ((header.metadata_type == VDO_METADATA_SLAB_JOURNAL) &&
	    (header.nonce == slab->allocator->nonce)) {
		journal->tail = header.sequence_number + 1;

		/* A clean slab implies an empty journal: head is set equal to the tail. */
		journal->head = (slab->allocator->summary_entries[slab->slab_number].is_dirty ?
				 header.head : journal->tail);
		journal->tail_header = header;
		initialize_journal_state(journal);
	}

	return_vio_to_pool(vio_as_pooled_vio(vio));
	vdo_finish_loading_with_result(&slab->state, allocate_counters_if_clean(slab));
}
static void read_slab_journal_tail_endio(struct bio *bio)
{
struct vio *vio = bio->bi_private;
struct slab_journal *journal = vio->completion.parent;
continue_vio_after_io(vio, finish_loading_journal,
journal->slab->allocator->thread_id);
}
static void handle_load_error(struct vdo_completion *completion)
{
int result = completion->result;
struct slab_journal *journal = completion->parent;
struct vio *vio = as_vio(completion);
vio_record_metadata_io_error(vio);
return_vio_to_pool(vio_as_pooled_vio(vio));
vdo_finish_loading_with_result(&journal->slab->state, result);
}
/*
 * read_slab_journal_tail() - Read the slab journal tail block once a vio is available.
 * @waiter: The journal's resource waiter.
 * @context: The pooled vio to read with.
 *
 * Implements vdo_waiter_callback_fn.
 */
static void read_slab_journal_tail(struct vdo_waiter *waiter, void *context)
{
	struct slab_journal *journal =
		container_of(waiter, struct slab_journal, resource_waiter);
	struct vdo_slab *slab = journal->slab;
	struct pooled_vio *pooled = context;
	struct vio *vio = &pooled->vio;
	tail_block_offset_t last_commit_point =
		slab->allocator->summary_entries[slab->slab_number].tail_block_offset;
	tail_block_offset_t tail_block;

	/* The summary's offset is one past the tail; an offset of 0 wraps to the last block. */
	if (last_commit_point == 0)
		tail_block = (tail_block_offset_t)(journal->size - 1);
	else
		tail_block = last_commit_point - 1;

	vio->completion.parent = journal;
	vio->completion.callback_thread_id = slab->allocator->thread_id;
	vdo_submit_metadata_vio(vio, slab->journal_origin + tail_block,
				read_slab_journal_tail_endio, handle_load_error,
				REQ_OP_READ);
}
/*
 * load_slab_journal() - Load a slab's journal by reading the journal's tail block,
 *                       if there is one.
 * @slab: The slab whose journal is to be loaded.
 */
static void load_slab_journal(struct vdo_slab *slab)
{
	struct slab_journal *journal = &slab->journal;
	tail_block_offset_t last_commit_point;

	last_commit_point = slab->allocator->summary_entries[slab->slab_number].tail_block_offset;
	if ((last_commit_point == 0) &&
	    !slab->allocator->summary_entries[slab->slab_number].load_ref_counts) {
		/*
		 * This slab claims a tail block at (journal->size - 1), but no saved reference
		 * counts. The scrubbing threshold makes that combination impossible on a real
		 * system, so don't bother reading the (bogus) data off disk.
		 */
		VDO_ASSERT_LOG_ONLY(((journal->size < 16) ||
				     (journal->scrubbing_threshold < (journal->size - 1))),
				    "Scrubbing threshold protects against reads of unwritten slab journal blocks");
		vdo_finish_loading_with_result(&slab->state,
					       allocate_counters_if_clean(slab));
		return;
	}

	journal->resource_waiter.callback = read_slab_journal_tail;
	acquire_vio_from_pool(slab->allocator->vio_pool, &journal->resource_waiter);
}
/*
 * register_slab_for_scrubbing() - Queue an unrecovered slab with the allocator's scrubber.
 * @slab: The slab to queue; it must not be rebuilt.
 * @high_priority: Whether to put the slab on the high-priority queue.
 */
static void register_slab_for_scrubbing(struct vdo_slab *slab, bool high_priority)
{
	struct slab_scrubber *scrubber = &slab->allocator->scrubber;

	VDO_ASSERT_LOG_ONLY((slab->status != VDO_SLAB_REBUILT),
			    "slab to be scrubbed is unrecovered");

	if (slab->status != VDO_SLAB_REQUIRES_SCRUBBING)
		return;

	list_del_init(&slab->allocq_entry);
	if (!slab->was_queued_for_scrubbing) {
		/* Count a slab only the first time it is queued. */
		WRITE_ONCE(scrubber->slab_count, scrubber->slab_count + 1);
		slab->was_queued_for_scrubbing = true;
	}

	if (high_priority) {
		slab->status = VDO_SLAB_REQUIRES_HIGH_PRIORITY_SCRUBBING;
		list_add_tail(&slab->allocq_entry, &scrubber->high_priority_slabs);
	} else {
		list_add_tail(&slab->allocq_entry, &scrubber->slabs);
	}
}
/*
 * queue_slab() - Put a slab back onto the appropriate queue (scrubber or allocator
 *                priority table) after an operation on it completes.
 * @slab: The slab to requeue; it must not already be on any list.
 */
static void queue_slab(struct vdo_slab *slab)
{
	struct block_allocator *allocator = slab->allocator;
	block_count_t free_blocks;
	int result;

	VDO_ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry),
			    "a requeued slab must not already be on a list");

	if (vdo_is_read_only(allocator->depot->vdo))
		return;

	free_blocks = slab->free_blocks;
	result = VDO_ASSERT((free_blocks <= allocator->depot->slab_config.data_blocks),
			    "rebuilt slab %u must have a valid free block count (has %llu, expected maximum %llu)",
			    slab->slab_number, (unsigned long long) free_blocks,
			    (unsigned long long) allocator->depot->slab_config.data_blocks);
	if (result != VDO_SUCCESS) {
		/* An impossible free count means the metadata cannot be trusted. */
		vdo_enter_read_only_mode(allocator->depot->vdo, result);
		return;
	}

	/* Unrecovered slabs go to the scrubber rather than the allocation queues. */
	if (slab->status != VDO_SLAB_REBUILT) {
		register_slab_for_scrubbing(slab, false);
		return;
	}

	if (!vdo_is_state_resuming(&slab->state)) {
		/*
		 * If the slab is resuming, its blocks were already accounted for, so don't
		 * deduct them again.
		 */
		WRITE_ONCE(allocator->allocated_blocks,
			   allocator->allocated_blocks - free_blocks);
		if (!is_slab_journal_blank(slab)) {
			WRITE_ONCE(allocator->statistics.slabs_opened,
				   allocator->statistics.slabs_opened + 1);
		}
	}

	/* NOTE(review): reopening appears tied to resuming after a save — confirm. */
	if (allocator->depot->vdo->suspend_type == VDO_ADMIN_STATE_SAVING)
		reopen_slab_journal(slab);

	prioritize_slab(slab);
}
/*
 * initiate_slab_action() - Launch the drain, load, or resume appropriate to the slab's
 *                          pending admin operation.
 * @state: The slab's admin state.
 *
 * Implements vdo_admin_initiator_fn.
 */
static void initiate_slab_action(struct admin_state *state)
{
	struct vdo_slab *slab = container_of(state, struct vdo_slab, state);

	if (vdo_is_state_draining(state)) {
		/* A scrubbing drain means the slab is being rebuilt. */
		if (vdo_get_admin_state_code(state) == VDO_ADMIN_STATE_SCRUBBING)
			slab->status = VDO_SLAB_REBUILDING;

		drain_slab(slab);
		check_if_slab_drained(slab);
	} else if (vdo_is_state_loading(state)) {
		load_slab_journal(slab);
	} else if (vdo_is_state_resuming(state)) {
		queue_slab(slab);
		vdo_finish_resuming(state);
	} else {
		vdo_finish_operation(state, VDO_INVALID_ADMIN_STATE);
	}
}
/* Pick the next slab to scrub: high-priority slabs first, then the regular queue. */
static struct vdo_slab *get_next_slab(struct slab_scrubber *scrubber)
{
	struct vdo_slab *urgent =
		list_first_entry_or_null(&scrubber->high_priority_slabs,
					 struct vdo_slab, allocq_entry);

	if (urgent != NULL)
		return urgent;

	return list_first_entry_or_null(&scrubber->slabs, struct vdo_slab,
					allocq_entry);
}
/* Check whether either scrubber queue still holds a slab. */
static inline bool __must_check has_slabs_to_scrub(struct slab_scrubber *scrubber)
{
	return get_next_slab(scrubber) != NULL;
}
/* Free the scrubber's journal read buffer, then tear down the vio itself. */
static void uninitialize_scrubber_vio(struct slab_scrubber *scrubber)
{
	vdo_free(vdo_forget(scrubber->vio.data));
	free_vio_components(&scrubber->vio);
}
/*
 * finish_scrubbing() - Stop scrubbing, either because all slabs are done or because of
 *                      an error.
 * @scrubber: The scrubber.
 * @result: The result of the scrubbing operation.
 */
static void finish_scrubbing(struct slab_scrubber *scrubber, int result)
{
	bool notify = vdo_waitq_has_waiters(&scrubber->waiters);
	bool done = !has_slabs_to_scrub(scrubber);
	struct block_allocator *allocator =
		container_of(scrubber, struct block_allocator, scrubber);

	if (done)
		uninitialize_scrubber_vio(scrubber);

	if (scrubber->high_priority_only) {
		scrubber->high_priority_only = false;
		vdo_fail_completion(vdo_forget(scrubber->vio.completion.parent), result);
	} else if (done && (atomic_add_return(-1, &allocator->depot->zones_to_scrub) == 0)) {
		/* All of this zone's slabs are scrubbed, and this is the last zone done. */
		enum vdo_state prior_state =
			atomic_cmpxchg(&allocator->depot->vdo->state, VDO_RECOVERING,
				       VDO_DIRTY);

		/* Order everything after the state transition attempt above. */
		smp_mb__after_atomic();

		/*
		 * The VDO state (not the read-only notifier) must be checked here: the
		 * compare-and-swap can fail because of a read-only entry this thread does
		 * not yet know about.
		 */
		if (prior_state == VDO_DIRTY)
			vdo_log_info("VDO commencing normal operation");
		else if (prior_state == VDO_RECOVERING)
			vdo_log_info("Exiting recovery mode");

		/* The large refcount-loading vio pool is no longer needed. */
		free_vio_pool(vdo_forget(allocator->refcount_big_vio_pool));
	}

	/*
	 * Mark the scrubber stopped: either finish the drain, or force the state to
	 * suspended when no drain was in progress.
	 */
	if (!vdo_finish_draining(&scrubber->admin_state))
		WRITE_ONCE(scrubber->admin_state.current_state,
			   VDO_ADMIN_STATE_SUSPENDED);

	/*
	 * Waiters can't be notified until after draining has finished or they might just
	 * requeue. If there were waiters, the scrubber can't have been freed yet.
	 */
	if (notify)
		vdo_waitq_notify_all_waiters(&scrubber->waiters, NULL, NULL);
}
static void scrub_next_slab(struct slab_scrubber *scrubber);
/*
 * slab_scrubbed() - Mark the current slab rebuilt, requeue it, and move on to the next.
 * @completion: The scrubber's vio completion.
 */
static void slab_scrubbed(struct vdo_completion *completion)
{
	struct slab_scrubber *scrubber =
		container_of(as_vio(completion), struct slab_scrubber, vio);
	struct vdo_slab *scrubbed = scrubber->slab;

	scrubbed->status = VDO_SLAB_REBUILT;
	queue_slab(scrubbed);
	reopen_slab_journal(scrubbed);
	WRITE_ONCE(scrubber->slab_count, scrubber->slab_count - 1);
	scrub_next_slab(scrubber);
}
/* A scrubbing failure is unrecoverable: force the VDO read-only and stop scrubbing. */
static void abort_scrubbing(struct slab_scrubber *scrubber, int result)
{
	vdo_enter_read_only_mode(scrubber->vio.completion.vdo, result);
	finish_scrubbing(scrubber, result);
}
/* Error handler for scrubber I/O: record the metadata error and abort. */
static void handle_scrubber_error(struct vdo_completion *completion)
{
	struct vio *vio = as_vio(completion);
	struct slab_scrubber *scrubber =
		container_of(vio, struct slab_scrubber, vio);

	vio_record_metadata_io_error(vio);
	abort_scrubbing(scrubber, completion->result);
}
/*
 * apply_block_entries() - Apply every entry of one slab journal block to the counters.
 * @block: The packed journal block.
 * @entry_count: The number of entries the block holds.
 * @block_number: The block's sequence number.
 * @slab: The slab to apply the entries to.
 *
 * Return: VDO_SUCCESS, or VDO_CORRUPT_JOURNAL / a replay error.
 */
static int apply_block_entries(struct packed_slab_journal_block *block,
			       journal_entry_count_t entry_count,
			       sequence_number_t block_number, struct vdo_slab *slab)
{
	int result;
	slab_block_number max_sbn = slab->end - slab->start;
	struct journal_point entry_point = {
		.sequence_number = block_number,
		.entry_count = 0,
	};

	for (; entry_point.entry_count < entry_count; entry_point.entry_count++) {
		struct slab_journal_entry entry =
			vdo_decode_slab_journal_entry(block, entry_point.entry_count);

		/* An offset beyond the slab means the journal block is corrupt. */
		if (entry.sbn > max_sbn) {
			return vdo_log_error_strerror(VDO_CORRUPT_JOURNAL,
						      "vdo_slab journal entry (%llu, %u) had invalid offset %u in slab (size %u blocks)",
						      (unsigned long long) block_number,
						      entry_point.entry_count,
						      entry.sbn, max_sbn);
		}

		result = replay_reference_count_change(slab, &entry_point, entry);
		if (result != VDO_SUCCESS) {
			vdo_log_error_strerror(result,
					       "vdo_slab journal entry (%llu, %u) (%s of offset %u) could not be applied in slab %u",
					       (unsigned long long) block_number,
					       entry_point.entry_count,
					       vdo_get_journal_operation_name(entry.operation),
					       entry.sbn, slab->slab_number);
			return result;
		}
	}

	return VDO_SUCCESS;
}
/*
 * apply_journal_entries() - Apply all entries of the scrubbed slab's journal to its
 *                           reference counts, then launch a save of the rebuilt counters.
 * @completion: The scrubber's vio, whose buffer holds the entire slab journal.
 */
static void apply_journal_entries(struct vdo_completion *completion)
{
	int result;
	struct slab_scrubber *scrubber =
		container_of(as_vio(completion), struct slab_scrubber, vio);
	struct vdo_slab *slab = scrubber->slab;
	struct slab_journal *journal = &slab->journal;

	/* Locate the live portion of the journal: the tail block names the head. */
	sequence_number_t tail = journal->tail;
	tail_block_offset_t end_index = (tail - 1) % journal->size;
	char *end_data = scrubber->vio.data + (end_index * VDO_BLOCK_SIZE);
	struct packed_slab_journal_block *end_block =
		(struct packed_slab_journal_block *) end_data;
	sequence_number_t head = __le64_to_cpu(end_block->header.head);
	tail_block_offset_t head_index = head % journal->size;
	block_count_t index = head_index;
	struct journal_point ref_counts_point = slab->slab_journal_point;
	struct journal_point last_entry_applied = ref_counts_point;
	sequence_number_t sequence;

	for (sequence = head; sequence < tail; sequence++) {
		char *block_data = scrubber->vio.data + (index * VDO_BLOCK_SIZE);
		struct packed_slab_journal_block *block =
			(struct packed_slab_journal_block *) block_data;
		struct slab_journal_block_header header;

		vdo_unpack_slab_journal_block_header(&block->header, &header);

		/* Reject any block that cannot legitimately occupy this journal slot. */
		if ((header.nonce != slab->allocator->nonce) ||
		    (header.metadata_type != VDO_METADATA_SLAB_JOURNAL) ||
		    (header.sequence_number != sequence) ||
		    (header.entry_count > journal->entries_per_block) ||
		    (header.has_block_map_increments &&
		     (header.entry_count > journal->full_entries_per_block))) {
			vdo_log_error("vdo_slab journal block for slab %u was invalid",
				      slab->slab_number);
			abort_scrubbing(scrubber, VDO_CORRUPT_JOURNAL);
			return;
		}

		result = apply_block_entries(block, header.entry_count, sequence, slab);
		if (result != VDO_SUCCESS) {
			abort_scrubbing(scrubber, result);
			return;
		}

		last_entry_applied.sequence_number = sequence;
		last_entry_applied.entry_count = header.entry_count - 1;
		/* The journal buffer is circular; wrap the block index. */
		index++;
		if (index == journal->size)
			index = 0;
	}

	/*
	 * After replay, the counters must be caught up to (at least) the last entry that
	 * was applied.
	 */
	result = VDO_ASSERT(!vdo_before_journal_point(&last_entry_applied,
						      &ref_counts_point),
			    "Refcounts are not more accurate than the slab journal");
	if (result != VDO_SUCCESS) {
		abort_scrubbing(scrubber, result);
		return;
	}

	/* Save the rebuilt reference blocks out to disk. */
	vdo_prepare_completion(completion, slab_scrubbed, handle_scrubber_error,
			       slab->allocator->thread_id, completion->parent);
	vdo_start_operation_with_waiter(&slab->state,
					VDO_ADMIN_STATE_SAVE_FOR_SCRUBBING,
					completion, initiate_slab_action);
}
/* Bio completion for the whole-journal read; apply entries on the allocator thread. */
static void read_slab_journal_endio(struct bio *bio)
{
	struct vio *vio = bio->bi_private;
	struct slab_scrubber *scrubber =
		container_of(vio, struct slab_scrubber, vio);

	continue_vio_after_io(vio, apply_journal_entries,
			      scrubber->slab->allocator->thread_id);
}
static void start_scrubbing(struct vdo_completion *completion)
{
struct slab_scrubber *scrubber =
container_of(as_vio(completion), struct slab_scrubber, vio);
struct vdo_slab *slab = scrubber->slab;
if (!slab->allocator->summary_entries[slab->slab_number].is_dirty) {
slab_scrubbed(completion);
return;
}
vdo_submit_metadata_vio(&scrubber->vio, slab->journal_origin,
read_slab_journal_endio, handle_scrubber_error,
REQ_OP_READ);
}
/*
 * scrub_next_slab() - Scrub the next slab if there is one.
 * @scrubber: The scrubber.
 */
static void scrub_next_slab(struct slab_scrubber *scrubber)
{
	struct vdo_completion *completion = &scrubber->vio.completion;
	struct vdo_slab *slab;

	/*
	 * Note: this notify call is always safe only because scrubbing can only be started
	 * when the VDO is quiescent.
	 */
	vdo_waitq_notify_all_waiters(&scrubber->waiters, NULL, NULL);

	if (vdo_is_read_only(completion->vdo)) {
		finish_scrubbing(scrubber, VDO_READ_ONLY);
		return;
	}

	slab = get_next_slab(scrubber);
	/* In high-priority-only mode, stop once the high-priority queue is empty. */
	if ((slab == NULL) ||
	    (scrubber->high_priority_only && list_empty(&scrubber->high_priority_slabs))) {
		finish_scrubbing(scrubber, VDO_SUCCESS);
		return;
	}

	/* If a drain completed here, do not start another slab. */
	if (vdo_finish_draining(&scrubber->admin_state))
		return;

	list_del_init(&slab->allocq_entry);
	scrubber->slab = slab;
	vdo_prepare_completion(completion, start_scrubbing, handle_scrubber_error,
			       slab->allocator->thread_id, completion->parent);
	vdo_start_operation_with_waiter(&slab->state, VDO_ADMIN_STATE_SCRUBBING,
					completion, initiate_slab_action);
}
/*
 * scrub_slabs() - Scrub all of an allocator's slabs that are eligible for scrubbing.
 * @allocator: The block_allocator to scrub.
 * @parent: The completion to notify when scrubbing is done; a non-NULL parent implies
 *          high-priority-only scrubbing. May be NULL.
 */
static void scrub_slabs(struct block_allocator *allocator, struct vdo_completion *parent)
{
	struct slab_scrubber *scrubber = &allocator->scrubber;

	scrubber->vio.completion.parent = parent;
	scrubber->high_priority_only = (parent != NULL);
	if (!has_slabs_to_scrub(scrubber)) {
		finish_scrubbing(scrubber, VDO_SUCCESS);
		return;
	}

	/*
	 * In high-priority-only mode with no allocatable slabs and an empty high-priority
	 * queue, promote the next queued slab so there is something to scrub.
	 */
	if (scrubber->high_priority_only &&
	    vdo_is_priority_table_empty(allocator->prioritized_slabs) &&
	    list_empty(&scrubber->high_priority_slabs))
		register_slab_for_scrubbing(get_next_slab(scrubber), true);

	vdo_resume_if_quiescent(&scrubber->admin_state);
	scrub_next_slab(scrubber);
}
/* Assert (log-only) that the caller is running on the allocator's callback thread. */
static inline void assert_on_allocator_thread(thread_id_t thread_id,
					      const char *function_name)
{
	VDO_ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == thread_id),
			    "%s called on correct thread", function_name);
}
/*
 * register_slab_with_allocator() - Account for a new slab in its allocator, recording it
 *                                  as the most recently registered slab.
 * @allocator: The allocator taking ownership of the slab.
 * @slab: The slab being registered.
 */
static void register_slab_with_allocator(struct block_allocator *allocator,
					 struct vdo_slab *slab)
{
	allocator->slab_count++;
	allocator->last_slab = slab->slab_number;
}
/*
 * get_depot_slab_iterator() - Initialize an iterator over some of the depot's slabs.
 * @depot: The depot over which to iterate.
 * @start: The number of the slab to start iterating from.
 * @end: The number of the last slab which may be returned.
 * @stride: The difference in slab number between successive slabs.
 *
 * Iteration proceeds from higher to lower slab numbers (next_slab() steps downward by
 * @stride until it passes @end).
 */
static struct slab_iterator get_depot_slab_iterator(struct slab_depot *depot,
						    slab_count_t start, slab_count_t end,
						    slab_count_t stride)
{
	struct vdo_slab **slabs = depot->slabs;

	return (struct slab_iterator) {
		.slabs = slabs,
		/* With no slab array, or start already below end, there is nothing to visit. */
		.next = (((slabs == NULL) || (start < end)) ? NULL : slabs[start]),
		.end = end,
		.stride = stride,
	};
}
static struct slab_iterator get_slab_iterator(const struct block_allocator *allocator)
{
return get_depot_slab_iterator(allocator->depot, allocator->last_slab,
allocator->zone_number,
allocator->depot->zone_count);
}
/*
 * next_slab() - Return the iterator's current slab and step downward by its stride.
 *
 * Return: The current slab, or NULL when iteration is exhausted.
 */
static struct vdo_slab *next_slab(struct slab_iterator *iterator)
{
	struct vdo_slab *this_slab = iterator->next;

	/* Stop once another downward stride would pass the iterator's end. */
	if ((this_slab == NULL) ||
	    (this_slab->slab_number < iterator->end + iterator->stride))
		iterator->next = NULL;
	else
		iterator->next =
			iterator->slabs[this_slab->slab_number - iterator->stride];

	return this_slab;
}
/*
 * abort_waiter() - Fail a queued reference update with VDO_READ_ONLY.
 * @waiter: The updater's waiter.
 * @context: Unused.
 *
 * Implements vdo_waiter_callback_fn.
 */
static void abort_waiter(struct vdo_waiter *waiter, void __always_unused *context)
{
	struct reference_updater *updater =
		container_of(waiter, struct reference_updater, waiter);
	struct data_vio *data_vio = data_vio_from_reference_updater(updater);

	/* Decrements complete a sub-completion; increments complete the data_vio. */
	if (!updater->increment) {
		vdo_continue_completion(&data_vio->decrement_completion,
					VDO_READ_ONLY);
		return;
	}

	continue_data_vio_with_error(data_vio, VDO_READ_ONLY);
}
/*
 * notify_block_allocator_of_read_only_mode() - Fail every queued slab journal entry in the
 *                                              allocator with VDO_READ_ONLY.
 * @listener: The block allocator.
 * @parent: The completion to finish when all slabs have been notified.
 */
static void notify_block_allocator_of_read_only_mode(void *listener,
						     struct vdo_completion *parent)
{
	struct block_allocator *allocator = listener;
	struct slab_iterator iterator;

	assert_on_allocator_thread(allocator->thread_id, __func__);
	iterator = get_slab_iterator(allocator);
	while (iterator.next != NULL) {
		struct vdo_slab *slab = next_slab(&iterator);

		vdo_waitq_notify_all_waiters(&slab->journal.entry_waiters,
					     abort_waiter, &slab->journal);
		check_if_slab_drained(slab);
	}

	vdo_finish_completion(parent);
}
/*
 * vdo_acquire_provisional_reference() - Acquire a provisional reference on behalf of a PBN
 *                                       lock if the block it locks is unreferenced.
 * @slab: The slab which contains the block.
 * @pbn: The physical block to reference.
 * @lock: The PBN lock (may be NULL).
 *
 * Return: VDO_SUCCESS or an error.
 */
int vdo_acquire_provisional_reference(struct vdo_slab *slab, physical_block_number_t pbn,
				      struct pbn_lock *lock)
{
	slab_block_number block_number;
	int result;

	/*
	 * NOTE(review): lock is NULL-checked below but passed unchecked here; presumably
	 * vdo_pbn_lock_has_provisional_reference() tolerates NULL — confirm.
	 */
	if (vdo_pbn_lock_has_provisional_reference(lock))
		return VDO_SUCCESS;

	if (!is_slab_open(slab))
		return VDO_INVALID_ADMIN_STATE;

	result = slab_block_number_from_pbn(slab, pbn, &block_number);
	if (result != VDO_SUCCESS)
		return result;

	if (slab->counters[block_number] == EMPTY_REFERENCE_COUNT) {
		make_provisional_reference(slab, block_number);
		if (lock != NULL)
			vdo_assign_pbn_lock_provisional_reference(lock);
	}

	/* The allocator's free count only changes when the lock now holds the reference. */
	if (vdo_pbn_lock_has_provisional_reference(lock))
		adjust_free_block_count(slab, false);

	return VDO_SUCCESS;
}
/*
 * Allocate the next free block in a slab: find a free reference counter,
 * mark it provisionally referenced, and return its absolute pbn through
 * block_number_ptr. Returns VDO_NO_SPACE when the slab is full, or
 * VDO_INVALID_ADMIN_STATE if the slab is not open.
 */
static int __must_check allocate_slab_block(struct vdo_slab *slab,
					    physical_block_number_t *block_number_ptr)
{
	slab_block_number free_index;

	if (!is_slab_open(slab))
		return VDO_INVALID_ADMIN_STATE;

	if (!search_reference_blocks(slab, &free_index))
		return VDO_NO_SPACE;

	VDO_ASSERT_LOG_ONLY((slab->counters[free_index] == EMPTY_REFERENCE_COUNT),
			    "free block must have ref count of zero");
	make_provisional_reference(slab, free_index);
	adjust_free_block_count(slab, false);

	/* Resume the next search just past the block we took. */
	slab->search_cursor.index = (free_index + 1);

	*block_number_ptr = slab->start + free_index;
	return VDO_SUCCESS;
}
/*
 * Make a slab the allocator's currently open slab (the one new block
 * allocations are drawn from) and update the open/reopen statistics.
 */
static void open_slab(struct vdo_slab *slab)
{
	reset_search_cursor(slab);
	if (is_slab_journal_blank(slab)) {
		WRITE_ONCE(slab->allocator->statistics.slabs_opened,
			   slab->allocator->statistics.slabs_opened + 1);
		/*
		 * A never-used slab has no on-disk reference data yet; mark
		 * all reference blocks dirty so they get written out.
		 */
		dirty_all_reference_blocks(slab);
	} else {
		WRITE_ONCE(slab->allocator->statistics.slabs_reopened,
			   slab->allocator->statistics.slabs_reopened + 1);
	}

	slab->allocator->open_slab = slab;
}
/*
 * Allocate a physical block from this allocator's zone. Tries the open
 * slab first; if that slab is out of space, it is returned to the priority
 * table and the highest-priority slab is opened in its place.
 */
int vdo_allocate_block(struct block_allocator *allocator,
		       physical_block_number_t *block_number_ptr)
{
	int result;

	if (allocator->open_slab != NULL) {
		result = allocate_slab_block(allocator->open_slab, block_number_ptr);
		/* Only VDO_NO_SPACE falls through to open another slab. */
		if ((result == VDO_SUCCESS) || (result != VDO_NO_SPACE))
			return result;

		/* Put the exhausted open slab back into the priority table. */
		prioritize_slab(allocator->open_slab);
	}

	open_slab(list_entry(vdo_priority_table_dequeue(allocator->prioritized_slabs),
			     struct vdo_slab, allocq_entry));
	return allocate_slab_block(allocator->open_slab, block_number_ptr);
}
/*
 * Queue a waiter to be notified when the scrubber next produces a clean
 * slab. Fails with VDO_READ_ONLY in read-only mode, or VDO_NO_SPACE when
 * the scrubber is quiescent (no more clean slabs will appear).
 */
int vdo_enqueue_clean_slab_waiter(struct block_allocator *allocator,
				  struct vdo_waiter *waiter)
{
	if (vdo_is_read_only(allocator->depot->vdo))
		return VDO_READ_ONLY;

	if (vdo_is_state_quiescent(&allocator->scrubber.admin_state))
		return VDO_NO_SPACE;

	vdo_waitq_enqueue_waiter(&allocator->scrubber.waiters, waiter);
	return VDO_SUCCESS;
}
/*
 * Queue a reference-count update against the slab journal of the slab that
 * owns the updater's pbn, then kick the journal to make entries. The
 * completion is failed immediately if the slab is closed or the VDO is
 * read-only.
 */
void vdo_modify_reference_count(struct vdo_completion *completion,
				struct reference_updater *updater)
{
	struct vdo_slab *slab = vdo_get_slab(completion->vdo->depot, updater->zpbn.pbn);

	if (!is_slab_open(slab)) {
		vdo_continue_completion(completion, VDO_INVALID_ADMIN_STATE);
		return;
	}

	if (vdo_is_read_only(completion->vdo)) {
		vdo_continue_completion(completion, VDO_READ_ONLY);
		return;
	}

	vdo_waitq_enqueue_waiter(&slab->journal.entry_waiters, &updater->waiter);
	/* An unrebuilt slab whose journal needs reaping gets priority scrubbing. */
	if ((slab->status != VDO_SLAB_REBUILT) && requires_reaping(&slab->journal))
		register_slab_for_scrubbing(slab, true);

	add_entries(&slab->journal);
}
/*
 * Release a single (non-journaled caller) reference on a physical block by
 * applying a synchronous decrement to its slab. The zero block carries no
 * references and is a no-op.
 */
int vdo_release_block_reference(struct block_allocator *allocator,
				physical_block_number_t pbn)
{
	struct reference_updater updater;

	if (pbn == VDO_ZERO_BLOCK)
		return VDO_SUCCESS;

	updater = (struct reference_updater) {
		.operation = VDO_JOURNAL_DATA_REMAPPING,
		.increment = false,
		.zpbn = {
			.pbn = pbn,
		},
	};

	return adjust_reference_count(vdo_get_slab(allocator->depot, pbn),
				      &updater, NULL);
}
/*
 * Min-heap comparator for slab statuses: clean slabs sort before dirty
 * ones, then emptier slabs before fuller ones, then lower slab numbers
 * first as a deterministic tie-break.
 */
static bool slab_status_is_less_than(const void *item1, const void *item2,
				     void __always_unused *args)
{
	const struct slab_status *left = item1;
	const struct slab_status *right = item2;

	if (left->is_clean != right->is_clean)
		return left->is_clean;

	if (left->emptiness != right->emptiness)
		return left->emptiness > right->emptiness;

	return left->slab_number < right->slab_number;
}
/*
 * Heap callbacks for ordering slab statuses. A NULL swp lets the min_heap
 * implementation use its default element swap.
 */
static const struct min_heap_callbacks slab_status_min_heap = {
	.less = slab_status_is_less_than,
	.swp = NULL,
};
/*
 * Completion callback counting down outstanding per-slab actions; when the
 * last one finishes, invoke the actor's final callback, otherwise reset
 * the shared completion for reuse by the next slab.
 */
static void slab_action_callback(struct vdo_completion *completion)
{
	struct block_allocator *allocator = vdo_as_block_allocator(completion);
	struct slab_actor *actor = &allocator->slab_actor;

	if (--actor->slab_action_count == 0) {
		actor->callback(completion);
		return;
	}

	vdo_reset_completion(completion);
}
/*
 * Error handler for allocator operations: propagate the error to any admin
 * operation waiter, then continue with the normal callback path.
 */
static void handle_operation_error(struct vdo_completion *completion)
{
	struct block_allocator *allocator = vdo_as_block_allocator(completion);

	if (allocator->state.waiter != NULL)
		vdo_set_completion_result(allocator->state.waiter, completion->result);

	completion->callback(completion);
}
/*
 * Launch the allocator's current admin operation on every slab in its
 * zone, invoking 'callback' once all slabs have finished.
 */
static void apply_to_slabs(struct block_allocator *allocator, vdo_action_fn callback)
{
	struct slab_iterator iterator;

	vdo_prepare_completion(&allocator->completion, slab_action_callback,
			       handle_operation_error, allocator->thread_id, NULL);
	allocator->completion.requeue = false;

	/*
	 * Start with a count of 1 so the final callback cannot fire while
	 * slabs are still being launched; the trailing slab_action_callback()
	 * below releases this guard.
	 */
	allocator->open_slab = NULL;
	allocator->slab_actor = (struct slab_actor) {
		.slab_action_count = 1,
		.callback = callback,
	};

	iterator = get_slab_iterator(allocator);
	while (iterator.next != NULL) {
		const struct admin_state_code *operation =
			vdo_get_admin_state_code(&allocator->state);
		struct vdo_slab *slab = next_slab(&iterator);

		list_del_init(&slab->allocq_entry);
		allocator->slab_actor.slab_action_count++;
		vdo_start_operation_with_waiter(&slab->state, operation,
						&allocator->completion,
						initiate_slab_action);
	}

	/* Drop the guard count taken above. */
	slab_action_callback(&allocator->completion);
}
/*
 * Finish loading an allocator: tear down the journal eraser if one was
 * used, and either hand off to recovery-journal replay (when loading for
 * recovery) or complete the load.
 */
static void finish_loading_allocator(struct vdo_completion *completion)
{
	struct block_allocator *allocator = vdo_as_block_allocator(completion);
	const struct admin_state_code *operation =
		vdo_get_admin_state_code(&allocator->state);

	if (allocator->eraser != NULL)
		dm_kcopyd_client_destroy(vdo_forget(allocator->eraser));

	if (operation == VDO_ADMIN_STATE_LOADING_FOR_RECOVERY) {
		void *context =
			vdo_get_current_action_context(allocator->depot->action_manager);

		vdo_replay_into_slab_journals(allocator, context);
		return;
	}

	vdo_finish_loading(&allocator->state);
}
static void erase_next_slab_journal(struct block_allocator *allocator);

/*
 * dm-kcopyd completion callback: fail the allocator's completion on any
 * read or write error, otherwise continue erasing the next slab journal.
 */
static void copy_callback(int read_err, unsigned long write_err, void *context)
{
	struct block_allocator *allocator = context;
	int result = (((read_err == 0) && (write_err == 0)) ? VDO_SUCCESS : -EIO);

	if (result != VDO_SUCCESS) {
		vdo_fail_completion(&allocator->completion, result);
		return;
	}

	erase_next_slab_journal(allocator);
}
/*
 * Zero out the journal region of the next slab in the erase iterator via
 * dm-kcopyd; completes the allocator's completion once all slabs are done.
 */
static void erase_next_slab_journal(struct block_allocator *allocator)
{
	struct vdo_slab *slab;
	physical_block_number_t pbn;
	struct dm_io_region regions[1];
	struct slab_depot *depot = allocator->depot;
	block_count_t blocks = depot->slab_config.slab_journal_blocks;

	if (allocator->slabs_to_erase.next == NULL) {
		vdo_finish_completion(&allocator->completion);
		return;
	}

	slab = next_slab(&allocator->slabs_to_erase);
	/* Translate to the backing device's address space. */
	pbn = slab->journal_origin - depot->vdo->geometry.bio_offset;
	regions[0] = (struct dm_io_region) {
		.bdev = vdo_get_backing_device(depot->vdo),
		.sector = pbn * VDO_SECTORS_PER_BLOCK,
		.count = blocks * VDO_SECTORS_PER_BLOCK,
	};
	dm_kcopyd_zero(allocator->eraser, 1, regions, 0, copy_callback, allocator);
}
/*
 * Initiate an allocator load. A rebuild load first erases all slab
 * journals with a kcopyd client before loading; other load types apply the
 * load operation to the slabs directly.
 */
static void initiate_load(struct admin_state *state)
{
	struct block_allocator *allocator =
		container_of(state, struct block_allocator, state);
	const struct admin_state_code *operation = vdo_get_admin_state_code(state);

	if (operation == VDO_ADMIN_STATE_LOADING_FOR_REBUILD) {
		vdo_prepare_completion_for_requeue(&allocator->completion,
						   finish_loading_allocator,
						   handle_operation_error,
						   allocator->thread_id, NULL);
		allocator->eraser = dm_kcopyd_client_create(NULL);
		if (IS_ERR(allocator->eraser)) {
			vdo_fail_completion(&allocator->completion,
					    PTR_ERR(allocator->eraser));
			/* Clear the error pointer so cleanup won't touch it. */
			allocator->eraser = NULL;
			return;
		}

		allocator->slabs_to_erase = get_slab_iterator(allocator);
		erase_next_slab_journal(allocator);
		return;
	}

	apply_to_slabs(allocator, finish_loading_allocator);
}
/*
 * Callback from recovery-journal replay: the slab journals of this zone
 * have been recovered, so the allocator's load can complete with the
 * replay's result.
 */
void vdo_notify_slab_journals_are_recovered(struct vdo_completion *completion)
{
	struct block_allocator *allocator = vdo_as_block_allocator(completion);

	vdo_finish_loading_with_result(&allocator->state, completion->result);
}
/*
 * Build an array of slab_status records (cleanliness and fullness hint
 * from the slab summary) for every slab in this allocator's zone. On
 * success the caller owns *statuses_ptr and must vdo_free() it.
 */
static int get_slab_statuses(struct block_allocator *allocator,
			     struct slab_status **statuses_ptr)
{
	int result;
	struct slab_status *statuses;
	struct slab_iterator iterator = get_slab_iterator(allocator);

	result = vdo_allocate(allocator->slab_count, struct slab_status, __func__,
			      &statuses);
	if (result != VDO_SUCCESS)
		return result;

	*statuses_ptr = statuses;

	while (iterator.next != NULL) {
		slab_count_t slab_number = next_slab(&iterator)->slab_number;

		*statuses++ = (struct slab_status) {
			.slab_number = slab_number,
			.is_clean = !allocator->summary_entries[slab_number].is_dirty,
			.emptiness = allocator->summary_entries[slab_number].fullness_hint,
		};
	}

	return VDO_SUCCESS;
}
/*
 * Prepare this allocator's slabs for allocation: heap-sort them by
 * cleanliness and emptiness, then either queue each slab directly for
 * allocation or register it for (possibly high-priority) scrubbing.
 */
static int __must_check vdo_prepare_slabs_for_allocation(struct block_allocator *allocator)
{
	struct slab_status current_slab_status;
	DEFINE_MIN_HEAP(struct slab_status, heap) heap;
	int result;
	struct slab_status *slab_statuses;
	struct slab_depot *depot = allocator->depot;

	/* Start from the optimistic assumption that every block is allocated. */
	WRITE_ONCE(allocator->allocated_blocks,
		   allocator->slab_count * depot->slab_config.data_blocks);
	result = get_slab_statuses(allocator, &slab_statuses);
	if (result != VDO_SUCCESS)
		return result;

	/* Sort the slabs by cleanliness, then by emptiness hint. */
	heap = (struct heap) {
		.data = slab_statuses,
		.nr = allocator->slab_count,
		.size = allocator->slab_count,
	};
	min_heapify_all(&heap, &slab_status_min_heap, NULL);

	while (heap.nr > 0) {
		bool high_priority;
		struct vdo_slab *slab;
		struct slab_journal *journal;

		current_slab_status = slab_statuses[0];
		min_heap_pop(&heap, &slab_status_min_heap, NULL);
		slab = depot->slabs[current_slab_status.slab_number];

		/*
		 * Rebuild loads, and clean slabs whose summary says not to
		 * load reference counts, can be queued for use immediately.
		 */
		if ((depot->load_type == VDO_SLAB_DEPOT_REBUILD_LOAD) ||
		    (!allocator->summary_entries[slab->slab_number].load_ref_counts &&
		     current_slab_status.is_clean)) {
			queue_slab(slab);
			continue;
		}

		slab->status = VDO_SLAB_REQUIRES_SCRUBBING;
		journal = &slab->journal;
		high_priority = ((current_slab_status.is_clean &&
				  (depot->load_type == VDO_SLAB_DEPOT_NORMAL_LOAD)) ||
				 (journal_length(journal) >= journal->scrubbing_threshold));
		register_slab_for_scrubbing(slab, high_priority);
	}

	vdo_free(slab_statuses);
	return VDO_SUCCESS;
}
/* Render a slab rebuild status as a short label for log output. */
static const char *status_to_string(enum slab_rebuild_status status)
{
	if (status == VDO_SLAB_REBUILT)
		return "REBUILT";
	if (status == VDO_SLAB_REQUIRES_SCRUBBING)
		return "SCRUBBING";
	if (status == VDO_SLAB_REQUIRES_HIGH_PRIORITY_SCRUBBING)
		return "PRIORITY_SCRUBBING";
	if (status == VDO_SLAB_REBUILDING)
		return "REBUILDING";
	if (status == VDO_SLAB_REPLAYING)
		return "REPLAYING";
	return "UNKNOWN";
}
/*
 * Dump the state of a block allocator, its slabs, their journals, and the
 * scrubber to the log for debugging.
 */
void vdo_dump_block_allocator(const struct block_allocator *allocator)
{
	unsigned int pause_counter = 0;
	struct slab_iterator iterator = get_slab_iterator(allocator);
	const struct slab_scrubber *scrubber = &allocator->scrubber;

	vdo_log_info("block_allocator zone %u", allocator->zone_number);
	while (iterator.next != NULL) {
		struct vdo_slab *slab = next_slab(&iterator);
		struct slab_journal *journal = &slab->journal;

		if (slab->reference_blocks != NULL) {
			/* Reference counts are loaded; show the free-block detail. */
			vdo_log_info("slab %u: P%u, %llu free", slab->slab_number,
				     slab->priority,
				     (unsigned long long) slab->free_blocks);
		} else {
			vdo_log_info("slab %u: status %s", slab->slab_number,
				     status_to_string(slab->status));
		}

		vdo_log_info("  slab journal: entry_waiters=%zu waiting_to_commit=%s updating_slab_summary=%s head=%llu unreapable=%llu tail=%llu next_commit=%llu summarized=%llu last_summarized=%llu recovery_lock=%llu dirty=%s",
			     vdo_waitq_num_waiters(&journal->entry_waiters),
			     vdo_bool_to_string(journal->waiting_to_commit),
			     vdo_bool_to_string(journal->updating_slab_summary),
			     (unsigned long long) journal->head,
			     (unsigned long long) journal->unreapable,
			     (unsigned long long) journal->tail,
			     (unsigned long long) journal->next_commit,
			     (unsigned long long) journal->summarized,
			     (unsigned long long) journal->last_summarized,
			     (unsigned long long) journal->recovery_lock,
			     vdo_bool_to_string(journal->recovery_lock != 0));

		if (slab->counters != NULL) {
			vdo_log_info("  slab: free=%u/%u blocks=%u dirty=%zu active=%zu journal@(%llu,%u)",
				     slab->free_blocks, slab->block_count,
				     slab->reference_block_count,
				     vdo_waitq_num_waiters(&slab->dirty_blocks),
				     slab->active_count,
				     (unsigned long long) slab->slab_journal_point.sequence_number,
				     slab->slab_journal_point.entry_count);
		} else {
			vdo_log_info("  no counters");
		}

		/* Pause every 32 slabs so we don't overwhelm the logger. */
		if (pause_counter++ == 31) {
			pause_counter = 0;
			vdo_pause_for_logger();
		}
	}

	vdo_log_info("slab_scrubber slab_count %u waiters %zu %s%s",
		     READ_ONCE(scrubber->slab_count),
		     vdo_waitq_num_waiters(&scrubber->waiters),
		     vdo_get_admin_state_code(&scrubber->admin_state)->name,
		     scrubber->high_priority_only ? ", high_priority_only " : "");
}
/* Free a slab and all of its owned allocations. Safe to call with NULL. */
static void free_slab(struct vdo_slab *slab)
{
	if (slab == NULL)
		return;

	list_del(&slab->allocq_entry);
	vdo_free(vdo_forget(slab->journal.block));
	vdo_free(vdo_forget(slab->journal.locks));
	vdo_free(vdo_forget(slab->counters));
	vdo_free(vdo_forget(slab->reference_blocks));
	vdo_free(slab);
}
/*
 * Initialize a slab's journal: allocate the lock array and tail block
 * buffer, then set all thresholds and header state from the depot's slab
 * configuration. Partial allocations are cleaned up by free_slab().
 */
static int initialize_slab_journal(struct vdo_slab *slab)
{
	struct slab_journal *journal = &slab->journal;
	const struct slab_config *slab_config = &slab->allocator->depot->slab_config;
	int result;

	result = vdo_allocate(slab_config->slab_journal_blocks, struct journal_lock,
			      __func__, &journal->locks);
	if (result != VDO_SUCCESS)
		return result;

	result = vdo_allocate(VDO_BLOCK_SIZE, char, "struct packed_slab_journal_block",
			      (char **) &journal->block);
	if (result != VDO_SUCCESS)
		return result;

	journal->slab = slab;
	journal->size = slab_config->slab_journal_blocks;
	journal->flushing_threshold = slab_config->slab_journal_flushing_threshold;
	journal->blocking_threshold = slab_config->slab_journal_blocking_threshold;
	journal->scrubbing_threshold = slab_config->slab_journal_scrubbing_threshold;
	journal->entries_per_block = VDO_SLAB_JOURNAL_ENTRIES_PER_BLOCK;
	journal->full_entries_per_block = VDO_SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK;
	journal->events = &slab->allocator->slab_journal_statistics;
	journal->recovery_journal = slab->allocator->depot->vdo->recovery_journal;
	journal->tail = 1;
	journal->head = 1;

	/*
	 * Flushing must finish before the journal blocks; leave a margin of
	 * up to 5 blocks below the blocking threshold when there is room.
	 */
	journal->flushing_deadline = journal->flushing_threshold;
	if ((journal->blocking_threshold - journal->flushing_threshold) > 5)
		journal->flushing_deadline = journal->blocking_threshold - 5;

	journal->slab_summary_waiter.callback = release_journal_locks;

	INIT_LIST_HEAD(&journal->dirty_entry);
	INIT_LIST_HEAD(&journal->uncommitted_blocks);

	journal->tail_header.nonce = slab->allocator->nonce;
	journal->tail_header.metadata_type = VDO_METADATA_SLAB_JOURNAL;
	initialize_journal_state(journal);
	return VDO_SUCCESS;
}
/*
 * Construct a new slab starting at slab_origin. A new slab (created by a
 * resize) gets its reference counters allocated immediately and starts in
 * the NEW admin state; an existing slab starts in normal operation and has
 * its counters loaded later. On success *slab_ptr is owned by the caller.
 */
static int __must_check make_slab(physical_block_number_t slab_origin,
				  struct block_allocator *allocator,
				  slab_count_t slab_number, bool is_new,
				  struct vdo_slab **slab_ptr)
{
	const struct slab_config *slab_config = &allocator->depot->slab_config;
	struct vdo_slab *slab;
	int result;

	result = vdo_allocate(1, struct vdo_slab, __func__, &slab);
	if (result != VDO_SUCCESS)
		return result;

	*slab = (struct vdo_slab) {
		.allocator = allocator,
		.start = slab_origin,
		.end = slab_origin + slab_config->slab_blocks,
		.slab_number = slab_number,
		/* Reference-count data lives just past the data blocks. */
		.ref_counts_origin = slab_origin + slab_config->data_blocks,
		.journal_origin =
			vdo_get_slab_journal_start_block(slab_config, slab_origin),
		.block_count = slab_config->data_blocks,
		.free_blocks = slab_config->data_blocks,
		.reference_block_count =
			vdo_get_saved_reference_count_size(slab_config->data_blocks),
	};
	INIT_LIST_HEAD(&slab->allocq_entry);

	result = initialize_slab_journal(slab);
	if (result != VDO_SUCCESS) {
		free_slab(slab);
		return result;
	}

	if (is_new) {
		vdo_set_admin_state_code(&slab->state, VDO_ADMIN_STATE_NEW);
		result = allocate_slab_counters(slab);
		if (result != VDO_SUCCESS) {
			free_slab(slab);
			return result;
		}
	} else {
		vdo_set_admin_state_code(&slab->state, VDO_ADMIN_STATE_NORMAL_OPERATION);
	}

	*slab_ptr = slab;
	return VDO_SUCCESS;
}
/*
 * Allocate a new slab pointer array sized for slab_count and construct the
 * slabs beyond the current count, assigning them to allocators round-robin
 * by zone. Existing slab pointers are carried over when resizing. On
 * failure, partially built slabs remain in depot->new_slabs for
 * vdo_abandon_new_slabs() to clean up.
 */
static int allocate_slabs(struct slab_depot *depot, slab_count_t slab_count)
{
	block_count_t slab_size;
	bool resizing = false;
	physical_block_number_t slab_origin;
	int result;

	result = vdo_allocate(slab_count, struct vdo_slab *,
			      "slab pointer array", &depot->new_slabs);
	if (result != VDO_SUCCESS)
		return result;

	if (depot->slabs != NULL) {
		memcpy(depot->new_slabs, depot->slabs,
		       depot->slab_count * sizeof(struct vdo_slab *));
		resizing = true;
	}

	slab_size = depot->slab_config.slab_blocks;
	slab_origin = depot->first_block + (depot->slab_count * slab_size);

	for (depot->new_slab_count = depot->slab_count;
	     depot->new_slab_count < slab_count;
	     depot->new_slab_count++, slab_origin += slab_size) {
		struct block_allocator *allocator =
			&depot->allocators[depot->new_slab_count % depot->zone_count];
		struct vdo_slab **slab_ptr = &depot->new_slabs[depot->new_slab_count];

		result = make_slab(slab_origin, allocator, depot->new_slab_count,
				   resizing, slab_ptr);
		if (result != VDO_SUCCESS)
			return result;
	}

	return VDO_SUCCESS;
}
/*
 * Discard any slabs created for a pending resize that has not been
 * committed, freeing only the slabs beyond the current slab count.
 */
void vdo_abandon_new_slabs(struct slab_depot *depot)
{
	slab_count_t i;

	if (depot->new_slabs == NULL)
		return;

	for (i = depot->slab_count; i < depot->new_slab_count; i++)
		free_slab(vdo_forget(depot->new_slabs[i]));
	depot->new_slab_count = 0;
	depot->new_size = 0;
	vdo_free(vdo_forget(depot->new_slabs));
}
/* Action-manager hook: map a zone number to its allocator's thread. */
static thread_id_t get_allocator_thread_id(void *context, zone_count_t zone_number)
{
	struct slab_depot *depot = context;

	return depot->allocators[zone_number].thread_id;
}
/*
 * Ask a slab journal to release its lock on the given recovery journal
 * block by committing its tail. Returns true if a commit was started.
 */
static bool __must_check release_recovery_journal_lock(struct slab_journal *journal,
						       sequence_number_t recovery_lock)
{
	if (recovery_lock > journal->recovery_lock) {
		/*
		 * A lock newer than the journal's should never be requested;
		 * this assertion is always false here and exists purely to
		 * log the invariant violation.
		 */
		VDO_ASSERT_LOG_ONLY((recovery_lock < journal->recovery_lock),
				    "slab journal recovery lock is not older than the recovery journal head");
		return false;
	}

	if ((recovery_lock < journal->recovery_lock) ||
	    vdo_is_read_only(journal->slab->allocator->depot->vdo))
		return false;

	/* All locks are held by the block which is in progress; commit it. */
	commit_tail(journal);
	return true;
}
/*
 * Zone action: walk this zone's dirty slab journals, committing tails to
 * release recovery-journal locks, stopping at the first journal that does
 * not release (the list appears to be kept in lock order — the remainder
 * cannot release either).
 */
static void release_tail_block_locks(void *context, zone_count_t zone_number,
				     struct vdo_completion *parent)
{
	struct slab_journal *journal, *tmp;
	struct slab_depot *depot = context;
	struct list_head *list = &depot->allocators[zone_number].dirty_slab_journals;

	list_for_each_entry_safe(journal, tmp, list, dirty_entry) {
		if (!release_recovery_journal_lock(journal,
						   depot->active_release_request))
			break;
	}

	vdo_finish_completion(parent);
}
/*
 * Action preamble: latch the most recent release request so the per-zone
 * release_tail_block_locks() actions all see the same value.
 */
static void prepare_for_tail_block_commit(void *context, struct vdo_completion *parent)
{
	struct slab_depot *depot = context;

	depot->active_release_request = depot->new_release_request;
	vdo_finish_completion(parent);
}
/*
 * Action-manager default scheduler: schedule a tail block commit only when
 * a new release request has arrived since the last one was handled.
 * Returns whether an action was scheduled.
 */
static bool schedule_tail_block_commit(void *context)
{
	struct slab_depot *depot = context;

	if (depot->new_release_request == depot->active_release_request)
		return false;

	return vdo_schedule_action(depot->action_manager,
				   prepare_for_tail_block_commit,
				   release_tail_block_locks,
				   NULL, NULL);
}
/*
 * Initialize an allocator's slab scrubber: allocate a buffer big enough
 * for an entire slab journal and a vio to read it with. The buffer's
 * ownership passes to the vio on success; on vio failure it is freed here.
 */
static int initialize_slab_scrubber(struct block_allocator *allocator)
{
	struct slab_scrubber *scrubber = &allocator->scrubber;
	block_count_t slab_journal_size =
		allocator->depot->slab_config.slab_journal_blocks;
	char *journal_data;
	int result;

	result = vdo_allocate(VDO_BLOCK_SIZE * slab_journal_size,
			      char, __func__, &journal_data);
	if (result != VDO_SUCCESS)
		return result;

	result = allocate_vio_components(allocator->completion.vdo,
					 VIO_TYPE_SLAB_JOURNAL,
					 VIO_PRIORITY_METADATA,
					 allocator, slab_journal_size,
					 journal_data, &scrubber->vio);
	if (result != VDO_SUCCESS) {
		vdo_free(journal_data);
		return result;
	}

	INIT_LIST_HEAD(&scrubber->high_priority_slabs);
	INIT_LIST_HEAD(&scrubber->slabs);
	/* Scrubbing starts suspended and is resumed when the depot loads. */
	vdo_set_admin_state_code(&scrubber->admin_state, VDO_ADMIN_STATE_SUSPENDED);
	return VDO_SUCCESS;
}
/*
 * Initialize one slab summary block: allocate its outgoing-entries buffer
 * and vio, and point it at its slice of the allocator's summary entries.
 */
static int __must_check initialize_slab_summary_block(struct block_allocator *allocator,
						      block_count_t index)
{
	struct slab_summary_block *block = &allocator->summary_blocks[index];
	int result;

	result = vdo_allocate(VDO_BLOCK_SIZE, char, __func__, &block->outgoing_entries);
	if (result != VDO_SUCCESS)
		return result;

	result = allocate_vio_components(allocator->depot->vdo, VIO_TYPE_SLAB_SUMMARY,
					 VIO_PRIORITY_METADATA, NULL, 1,
					 block->outgoing_entries, &block->vio);
	if (result != VDO_SUCCESS)
		return result;

	block->allocator = allocator;
	block->entries = &allocator->summary_entries[VDO_SLAB_SUMMARY_ENTRIES_PER_BLOCK * index];
	block->index = index;
	return VDO_SUCCESS;
}
/*
 * Initialize one zone's block allocator: read-only listener, vio pools
 * (one for slab journals, one sized for reading reference-count blocks),
 * scrubber, priority table, and slab summary blocks. Partially initialized
 * state is cleaned up by vdo_free_slab_depot().
 */
static int __must_check initialize_block_allocator(struct slab_depot *depot,
						   zone_count_t zone)
{
	int result;
	block_count_t i;
	struct block_allocator *allocator = &depot->allocators[zone];
	struct vdo *vdo = depot->vdo;
	block_count_t max_free_blocks = depot->slab_config.data_blocks;
	unsigned int max_priority = (2 + ilog2(max_free_blocks));
	u32 reference_block_count, refcount_reads_needed, refcount_blocks_per_vio;

	*allocator = (struct block_allocator) {
		.depot = depot,
		.zone_number = zone,
		.thread_id = vdo->thread_config.physical_threads[zone],
		.nonce = vdo->states.vdo.nonce,
	};

	INIT_LIST_HEAD(&allocator->dirty_slab_journals);
	vdo_set_admin_state_code(&allocator->state, VDO_ADMIN_STATE_NORMAL_OPERATION);
	result = vdo_register_read_only_listener(vdo, allocator,
						 notify_block_allocator_of_read_only_mode,
						 allocator->thread_id);
	if (result != VDO_SUCCESS)
		return result;

	vdo_initialize_completion(&allocator->completion, vdo, VDO_BLOCK_ALLOCATOR_COMPLETION);
	result = make_vio_pool(vdo, BLOCK_ALLOCATOR_VIO_POOL_SIZE, 1, allocator->thread_id,
			       VIO_TYPE_SLAB_JOURNAL, VIO_PRIORITY_METADATA,
			       allocator, &allocator->vio_pool);
	if (result != VDO_SUCCESS)
		return result;

	/*
	 * Spread the reference-count blocks evenly over the minimum number
	 * of reads that MAX_BLOCKS_PER_VIO allows.
	 */
	reference_block_count = vdo_get_saved_reference_count_size(depot->slab_config.slab_blocks);
	refcount_reads_needed = DIV_ROUND_UP(reference_block_count, MAX_BLOCKS_PER_VIO);
	refcount_blocks_per_vio = DIV_ROUND_UP(reference_block_count, refcount_reads_needed);
	allocator->refcount_blocks_per_big_vio = refcount_blocks_per_vio;
	result = make_vio_pool(vdo, BLOCK_ALLOCATOR_REFCOUNT_VIO_POOL_SIZE,
			       allocator->refcount_blocks_per_big_vio, allocator->thread_id,
			       VIO_TYPE_SLAB_JOURNAL, VIO_PRIORITY_METADATA,
			       NULL, &allocator->refcount_big_vio_pool);
	if (result != VDO_SUCCESS)
		return result;

	result = initialize_slab_scrubber(allocator);
	if (result != VDO_SUCCESS)
		return result;

	result = vdo_make_priority_table(max_priority, &allocator->prioritized_slabs);
	if (result != VDO_SUCCESS)
		return result;

	result = vdo_allocate(VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE,
			      struct slab_summary_block, __func__,
			      &allocator->summary_blocks);
	if (result != VDO_SUCCESS)
		return result;

	vdo_set_admin_state_code(&allocator->summary_state,
				 VDO_ADMIN_STATE_NORMAL_OPERATION);
	allocator->summary_entries = depot->summary_entries + (MAX_VDO_SLABS * zone);

	for (i = 0; i < VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE; i++) {
		result = initialize_slab_summary_block(allocator, i);
		if (result != VDO_SUCCESS)
			return result;
	}

	/* Slabs that have never been opened rank below 3/4-empty ones. */
	allocator->unopened_slab_priority = (1 + ilog2((max_free_blocks * 3) / 4));
	return VDO_SUCCESS;
}
/*
 * Allocate and initialize all of a depot's components: action manager,
 * slab summary entries (initialized to the all-free default), per-zone
 * block allocators, and the slabs themselves.
 */
static int allocate_components(struct slab_depot *depot,
			       struct partition *summary_partition)
{
	int result;
	zone_count_t zone;
	slab_count_t slab_count;
	u8 hint;
	u32 i;
	const struct thread_config *thread_config = &depot->vdo->thread_config;

	result = vdo_make_action_manager(depot->zone_count, get_allocator_thread_id,
					 thread_config->journal_thread, depot,
					 schedule_tail_block_commit,
					 depot->vdo, &depot->action_manager);
	if (result != VDO_SUCCESS)
		return result;

	depot->origin = depot->first_block;

	/* Summary entries must pack evenly into blocks. */
	BUILD_BUG_ON((VDO_BLOCK_SIZE % sizeof(struct slab_summary_entry)) != 0);
	depot->summary_origin = summary_partition->offset;
	depot->hint_shift = vdo_get_slab_summary_hint_shift(depot->slab_size_shift);
	result = vdo_allocate(MAXIMUM_VDO_SLAB_SUMMARY_ENTRIES,
			      struct slab_summary_entry, __func__,
			      &depot->summary_entries);
	if (result != VDO_SUCCESS)
		return result;

	/* Initialize all the entries to the fully-free default. */
	hint = compute_fullness_hint(depot, depot->slab_config.data_blocks);
	for (i = 0; i < MAXIMUM_VDO_SLAB_SUMMARY_ENTRIES; i++) {
		depot->summary_entries[i] = (struct slab_summary_entry) {
			.tail_block_offset = 0,
			.fullness_hint = hint,
			.load_ref_counts = false,
			.is_dirty = false,
		};
	}

	slab_count = vdo_compute_slab_count(depot->first_block, depot->last_block,
					    depot->slab_size_shift);
	if (thread_config->physical_zone_count > slab_count) {
		return vdo_log_error_strerror(VDO_BAD_CONFIGURATION,
					      "%u physical zones exceeds slab count %u",
					      thread_config->physical_zone_count,
					      slab_count);
	}

	for (zone = 0; zone < depot->zone_count; zone++) {
		result = initialize_block_allocator(depot, zone);
		if (result != VDO_SUCCESS)
			return result;
	}

	/* Allocate slabs and move them from new_slabs into service. */
	result = allocate_slabs(depot, slab_count);
	if (result != VDO_SUCCESS)
		return result;

	for (i = depot->slab_count; i < depot->new_slab_count; i++) {
		struct vdo_slab *slab = depot->new_slabs[i];

		register_slab_with_allocator(slab->allocator, slab);
		WRITE_ONCE(depot->slab_count, depot->slab_count + 1);
	}

	depot->slabs = depot->new_slabs;
	depot->new_slabs = NULL;
	depot->new_slab_count = 0;

	return VDO_SUCCESS;
}
/*
 * Decode a slab depot from its saved on-disk state. The slab size must be
 * a power of two so slab numbers can be derived by shifting. On success
 * *depot_ptr is owned by the caller (freed with vdo_free_slab_depot()).
 */
int vdo_decode_slab_depot(struct slab_depot_state_2_0 state, struct vdo *vdo,
			  struct partition *summary_partition,
			  struct slab_depot **depot_ptr)
{
	unsigned int slab_size_shift;
	struct slab_depot *depot;
	int result;
	block_count_t slab_size = state.slab_config.slab_blocks;

	if (!is_power_of_2(slab_size)) {
		return vdo_log_error_strerror(UDS_INVALID_ARGUMENT,
					      "slab size must be a power of two");
	}
	slab_size_shift = ilog2(slab_size);

	/* The allocator array is allocated inline after the depot. */
	result = vdo_allocate_extended(struct slab_depot,
				       vdo->thread_config.physical_zone_count,
				       struct block_allocator, __func__, &depot);
	if (result != VDO_SUCCESS)
		return result;

	depot->vdo = vdo;
	depot->old_zone_count = state.zone_count;
	depot->zone_count = vdo->thread_config.physical_zone_count;
	depot->slab_config = state.slab_config;
	depot->first_block = state.first_block;
	depot->last_block = state.last_block;
	depot->slab_size_shift = slab_size_shift;

	result = allocate_components(depot, summary_partition);
	if (result != VDO_SUCCESS) {
		vdo_free_slab_depot(depot);
		return result;
	}

	*depot_ptr = depot;
	return VDO_SUCCESS;
}
/* Tear down an allocator's slab summary blocks and their vios/buffers. */
static void uninitialize_allocator_summary(struct block_allocator *allocator)
{
	block_count_t i;

	if (allocator->summary_blocks == NULL)
		return;

	for (i = 0; i < VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE; i++) {
		free_vio_components(&allocator->summary_blocks[i].vio);
		vdo_free(vdo_forget(allocator->summary_blocks[i].outgoing_entries));
	}

	vdo_free(vdo_forget(allocator->summary_blocks));
}
/*
 * Free a slab depot and everything it owns: abandoned resize slabs,
 * per-zone allocator resources, all slabs, the action manager, and the
 * summary entries. Safe to call with NULL or a partially built depot.
 */
void vdo_free_slab_depot(struct slab_depot *depot)
{
	zone_count_t zone = 0;

	if (depot == NULL)
		return;

	vdo_abandon_new_slabs(depot);

	for (zone = 0; zone < depot->zone_count; zone++) {
		struct block_allocator *allocator = &depot->allocators[zone];

		if (allocator->eraser != NULL)
			dm_kcopyd_client_destroy(vdo_forget(allocator->eraser));

		uninitialize_allocator_summary(allocator);
		uninitialize_scrubber_vio(&allocator->scrubber);
		free_vio_pool(vdo_forget(allocator->vio_pool));
		free_vio_pool(vdo_forget(allocator->refcount_big_vio_pool));
		vdo_free_priority_table(vdo_forget(allocator->prioritized_slabs));
	}

	if (depot->slabs != NULL) {
		slab_count_t i;

		for (i = 0; i < depot->slab_count; i++)
			free_slab(vdo_forget(depot->slabs[i]));
	}

	vdo_free(vdo_forget(depot->slabs));
	vdo_free(vdo_forget(depot->action_manager));
	vdo_free(vdo_forget(depot->summary_entries));
	vdo_free(depot);
}
/*
 * Record the depot's configuration for saving. A depot with a zero zone
 * count (presumably one that is not currently running) reports the zone
 * count it previously had so the saved state stays meaningful.
 */
struct slab_depot_state_2_0 vdo_record_slab_depot(const struct slab_depot *depot)
{
	return (struct slab_depot_state_2_0) {
		.slab_config = depot->slab_config,
		.first_block = depot->first_block,
		.last_block = depot->last_block,
		.zone_count = ((depot->zone_count == 0) ?
			       depot->old_zone_count : depot->zone_count),
	};
}
/*
 * Allocate the reference counters for every slab in the depot, iterating
 * from the highest-numbered slab down to slab 0.
 */
int vdo_allocate_reference_counters(struct slab_depot *depot)
{
	struct slab_iterator iterator =
		get_depot_slab_iterator(depot, depot->slab_count - 1, 0, 1);

	while (iterator.next != NULL) {
		int result = allocate_slab_counters(next_slab(&iterator));

		if (result != VDO_SUCCESS)
			return result;
	}

	return VDO_SUCCESS;
}
/*
 * Translate a physical block number into the number of the slab that
 * contains it; VDO_OUT_OF_RANGE if the pbn lies outside every slab.
 */
static int __must_check get_slab_number(const struct slab_depot *depot,
					physical_block_number_t pbn,
					slab_count_t *slab_number_ptr)
{
	slab_count_t candidate;

	if (pbn < depot->first_block)
		return VDO_OUT_OF_RANGE;

	candidate = (pbn - depot->first_block) >> depot->slab_size_shift;
	if (candidate >= depot->slab_count)
		return VDO_OUT_OF_RANGE;

	*slab_number_ptr = candidate;
	return VDO_SUCCESS;
}
/*
 * Look up the slab containing a pbn. Returns NULL for the zero block; an
 * out-of-range pbn puts the VDO into read-only mode and also returns NULL.
 */
struct vdo_slab *vdo_get_slab(const struct slab_depot *depot,
			      physical_block_number_t pbn)
{
	slab_count_t slab_number;
	int result;

	if (pbn == VDO_ZERO_BLOCK)
		return NULL;

	result = get_slab_number(depot, pbn, &slab_number);
	if (result != VDO_SUCCESS) {
		vdo_enter_read_only_mode(depot->vdo, result);
		return NULL;
	}

	return depot->slabs[slab_number];
}
/*
 * Return how many more references may be added to a block: 0 if the block
 * is invalid or its slab isn't rebuilt, otherwise the headroom below
 * MAXIMUM_REFERENCE_COUNT (a provisional reference counts as one).
 */
u8 vdo_get_increment_limit(struct slab_depot *depot, physical_block_number_t pbn)
{
	struct vdo_slab *slab = vdo_get_slab(depot, pbn);
	vdo_refcount_t *counter_ptr = NULL;
	int result;

	if ((slab == NULL) || (slab->status != VDO_SLAB_REBUILT))
		return 0;

	result = get_reference_counter(slab, pbn, &counter_ptr);
	if (result != VDO_SUCCESS)
		return 0;

	if (*counter_ptr == PROVISIONAL_REFERENCE_COUNT)
		return (MAXIMUM_REFERENCE_COUNT - 1);

	return (MAXIMUM_REFERENCE_COUNT - *counter_ptr);
}
/*
 * Check whether a pbn is a valid location for data: either the zero block,
 * or a data block within some slab (the journal and reference-count
 * regions of a slab fail the slab_block_number_from_pbn() check).
 */
bool vdo_is_physical_data_block(const struct slab_depot *depot,
				physical_block_number_t pbn)
{
	slab_count_t slab_number;
	slab_block_number sbn;

	return ((pbn == VDO_ZERO_BLOCK) ||
		((get_slab_number(depot, pbn, &slab_number) == VDO_SUCCESS) &&
		 (slab_block_number_from_pbn(depot->slabs[slab_number], pbn, &sbn) ==
		  VDO_SUCCESS)));
}
/*
 * Sum the allocated-block counts of all zones. The per-zone counters are
 * written on other threads, so each is sampled with READ_ONCE; the total
 * is therefore only approximate while allocations are in flight.
 */
block_count_t vdo_get_slab_depot_allocated_blocks(const struct slab_depot *depot)
{
	zone_count_t zone;
	block_count_t sum = 0;

	for (zone = 0; zone < depot->zone_count; zone++)
		sum += READ_ONCE(depot->allocators[zone].allocated_blocks);

	return sum;
}
block_count_t vdo_get_slab_depot_data_blocks(const struct slab_depot *depot)
{
return (READ_ONCE(depot->slab_count) * depot->slab_config.data_blocks);
}
/*
 * Clean up after the summary combine/write: free the vio and propagate its
 * result to the parent completion.
 */
static void finish_combining_zones(struct vdo_completion *completion)
{
	int result = completion->result;
	struct vdo_completion *parent = completion->parent;

	free_vio(as_vio(vdo_forget(completion)));
	vdo_fail_completion(parent, result);
}
/* Error path for summary I/O: record the metadata error, then clean up. */
static void handle_combining_error(struct vdo_completion *completion)
{
	vio_record_metadata_io_error(as_vio(completion));
	finish_combining_zones(completion);
}
/* Bio endio for the summary write: continue on the admin thread. */
static void write_summary_endio(struct bio *bio)
{
	struct vio *vio = bio->bi_private;
	struct vdo *vdo = vio->completion.vdo;

	continue_vio_after_io(vio, finish_combining_zones,
			      vdo->thread_config.admin_thread);
}
/*
 * Combine the per-zone slab summary data. Entries that belonged to other
 * zones under the old zone count are folded into zone 0's region, then
 * zone 0's region is replicated into every zone's region of the buffer.
 */
static void combine_summaries(struct slab_depot *depot)
{
	zone_count_t zone = 0;
	struct slab_summary_entry *entries = depot->summary_entries;

	if (depot->old_zone_count > 1) {
		slab_count_t entry_number;

		/* Slabs were assigned to zones round-robin, so cycle 'zone'. */
		for (entry_number = 0; entry_number < MAX_VDO_SLABS; entry_number++) {
			if (zone != 0) {
				memcpy(entries + entry_number,
				       entries + (zone * MAX_VDO_SLABS) + entry_number,
				       sizeof(struct slab_summary_entry));
			}

			zone++;
			if (zone == depot->old_zone_count)
				zone = 0;
		}
	}

	/* Copy the combined data to each zone's region of the buffer. */
	for (zone = 1; zone < MAX_VDO_PHYSICAL_ZONES; zone++) {
		memcpy(entries + (zone * MAX_VDO_SLABS), entries,
		       MAX_VDO_SLABS * sizeof(struct slab_summary_entry));
	}
}
/*
 * After reading the summary (or when formatting/rebuilding): combine the
 * zones and write the combined summary back out.
 */
static void finish_loading_summary(struct vdo_completion *completion)
{
	struct slab_depot *depot = completion->vdo->depot;

	combine_summaries(depot);
	vdo_submit_metadata_vio(as_vio(completion), depot->summary_origin,
				write_summary_endio, handle_combining_error,
				REQ_OP_WRITE);
}
/* Bio endio for the summary read: continue on the admin thread. */
static void load_summary_endio(struct bio *bio)
{
	struct vio *vio = bio->bi_private;
	struct vdo *vdo = vio->completion.vdo;

	continue_vio_after_io(vio, finish_loading_summary,
			      vdo->thread_config.admin_thread);
}
/*
 * Action preamble for loading the depot: read the slab summary from disk.
 * When formatting or rebuilding there is nothing to read, so go straight
 * to combining and writing the (default) in-memory entries.
 */
static void load_slab_summary(void *context, struct vdo_completion *parent)
{
	int result;
	struct vio *vio;
	struct slab_depot *depot = context;
	const struct admin_state_code *operation =
		vdo_get_current_manager_operation(depot->action_manager);

	result = create_multi_block_metadata_vio(depot->vdo, VIO_TYPE_SLAB_SUMMARY,
						 VIO_PRIORITY_METADATA, parent,
						 VDO_SLAB_SUMMARY_BLOCKS,
						 (char *) depot->summary_entries, &vio);
	if (result != VDO_SUCCESS) {
		vdo_fail_completion(parent, result);
		return;
	}

	if ((operation == VDO_ADMIN_STATE_FORMATTING) ||
	    (operation == VDO_ADMIN_STATE_LOADING_FOR_REBUILD)) {
		finish_loading_summary(&vio->completion);
		return;
	}

	vdo_submit_metadata_vio(vio, depot->summary_origin, load_summary_endio,
				handle_combining_error, REQ_OP_READ);
}
/* Per-zone action: begin loading one zone's block allocator. */
static void load_allocator(void *context, zone_count_t zone_number,
			   struct vdo_completion *parent)
{
	struct slab_depot *depot = context;
	struct block_allocator *allocator = &depot->allocators[zone_number];
	const struct admin_state_code *operation =
		vdo_get_current_manager_operation(depot->action_manager);

	vdo_start_loading(&allocator->state, operation, parent, initiate_load);
}
/*
 * Asynchronously load the slab depot: first the slab summary, then each
 * zone's allocator. The parent completion is notified when the operation
 * finishes, or immediately if the operation is not a valid load operation.
 */
void vdo_load_slab_depot(struct slab_depot *depot,
			 const struct admin_state_code *operation,
			 struct vdo_completion *parent, void *context)
{
	if (vdo_assert_load_operation(operation, parent)) {
		vdo_schedule_operation_with_context(depot->action_manager,
						    operation, load_slab_summary,
						    load_allocator, NULL,
						    context, parent);
	}
}
/* Per-zone action: ready one zone's slabs for allocation, then scrub them. */
static void prepare_to_allocate(void *context, zone_count_t zone_number,
				struct vdo_completion *parent)
{
	struct slab_depot *depot = context;
	struct block_allocator *allocator = &depot->allocators[zone_number];
	int result = vdo_prepare_slabs_for_allocation(allocator);

	if (result == VDO_SUCCESS)
		scrub_slabs(allocator, parent);
	else
		vdo_fail_completion(parent, result);
}
/*
 * Prepare the depot for servicing allocation requests: record how much of
 * the depot should be loaded, note that every zone still needs scrubbing,
 * and schedule the per-zone prepare_to_allocate action.
 */
void vdo_prepare_slab_depot_to_allocate(struct slab_depot *depot,
					enum slab_depot_load_type load_type,
					struct vdo_completion *parent)
{
	depot->load_type = load_type;
	/* Each zone decrements this as its scrubbing completes. */
	atomic_set(&depot->zones_to_scrub, depot->zone_count);
	vdo_schedule_action(depot->action_manager, NULL,
			    prepare_to_allocate, NULL, parent);
}
/* Commit a pending grow by adopting the previously recorded new last block. */
void vdo_update_slab_depot_size(struct slab_depot *depot)
{
	depot->last_block = depot->new_last_block;
}
/*
 * Allocate new slabs in preparation for growing the depot onto a larger
 * partition. Nothing visible changes until vdo_use_new_slabs() and
 * vdo_update_slab_depot_size() commit the growth.
 *
 * Returns VDO_SUCCESS, VDO_INCREMENT_TOO_SMALL when the new partition would
 * not add any slabs, or an allocation/configuration error code.
 */
int vdo_prepare_to_grow_slab_depot(struct slab_depot *depot,
				   const struct partition *partition)
{
	struct slab_depot_state_2_0 new_state;
	slab_count_t new_slab_count;
	int result;

	/* A partition too small to hold more slabs is not a growth. */
	if ((partition->count >> depot->slab_size_shift) <= depot->slab_count)
		return VDO_INCREMENT_TOO_SMALL;

	VDO_ASSERT_LOG_ONLY(depot->first_block == partition->offset,
			    "New slab depot partition doesn't change origin");

	result = vdo_configure_slab_depot(partition, depot->slab_config,
					  depot->zone_count, &new_state);
	if (result != VDO_SUCCESS)
		return result;

	new_slab_count = vdo_compute_slab_count(depot->first_block,
						new_state.last_block,
						depot->slab_size_shift);
	if (new_slab_count <= depot->slab_count)
		return vdo_log_error_strerror(VDO_INCREMENT_TOO_SMALL,
					      "Depot can only grow");

	/* The slabs for this size were already allocated by an earlier call. */
	if (new_slab_count == depot->new_slab_count)
		return VDO_SUCCESS;

	/* Discard any slabs allocated for a different prior growth attempt. */
	vdo_abandon_new_slabs(depot);
	result = allocate_slabs(depot, new_slab_count);
	if (result != VDO_SUCCESS) {
		vdo_abandon_new_slabs(depot);
		return result;
	}

	depot->new_size = partition->count;
	depot->old_last_block = depot->last_block;
	depot->new_last_block = new_state.last_block;

	return VDO_SUCCESS;
}
/*
 * Conclusion of the register-new-slabs operation: swap the new slab array in
 * for the old one and publish the new slab count.
 */
static int finish_registration(void *context)
{
	struct slab_depot *depot = context;
	void *old_slabs = depot->slabs;

	/* Publish the new count before exposing the new array. */
	WRITE_ONCE(depot->slab_count, depot->new_slab_count);
	depot->slabs = depot->new_slabs;
	depot->new_slabs = NULL;
	depot->new_slab_count = 0;
	vdo_free(old_slabs);

	return VDO_SUCCESS;
}
/* Per-zone action: adopt the newly allocated slabs belonging to this zone. */
static void register_new_slabs(void *context, zone_count_t zone_number,
			       struct vdo_completion *parent)
{
	struct slab_depot *depot = context;
	struct block_allocator *allocator = &depot->allocators[zone_number];
	slab_count_t index;

	/* Only the slabs past the old count are new. */
	for (index = depot->slab_count; index < depot->new_slab_count; index++) {
		struct vdo_slab *new_slab = depot->new_slabs[index];

		if (new_slab->allocator == allocator)
			register_slab_with_allocator(allocator, new_slab);
	}

	vdo_finish_completion(parent);
}
/*
 * Put the previously prepared new slabs into service: each zone registers
 * its slabs (register_new_slabs) while operations are suspended, and
 * finish_registration then swaps in the new slab array.
 */
void vdo_use_new_slabs(struct slab_depot *depot, struct vdo_completion *parent)
{
	VDO_ASSERT_LOG_ONLY(depot->new_slabs != NULL, "Must have new slabs to use");
	vdo_schedule_operation(depot->action_manager,
			       VDO_ADMIN_STATE_SUSPENDED_OPERATION,
			       NULL, register_new_slabs,
			       finish_registration, parent);
}
/*
 * Drain step: suspend the allocator's scrubber, or finish immediately when
 * it is already quiescent.
 */
static void stop_scrubbing(struct block_allocator *allocator)
{
	struct slab_scrubber *scrubber = &allocator->scrubber;

	if (!vdo_is_state_quiescent(&scrubber->admin_state)) {
		vdo_start_draining(&scrubber->admin_state,
				   VDO_ADMIN_STATE_SUSPENDING,
				   &allocator->completion, NULL);
		return;
	}

	/* Already quiescent; nothing to drain. */
	vdo_finish_completion(&allocator->completion);
}
/* Admin-state initiator for draining an allocator's slab summary state. */
static void initiate_summary_drain(struct admin_state *state)
{
	struct block_allocator *allocator =
		container_of(state, struct block_allocator, summary_state);

	check_summary_drain_complete(allocator);
}
/*
 * Perform the next step of draining an allocator. Each asynchronous step
 * re-enters this function (via the requeued completion) so that all work
 * stays on the allocator's thread, stepping through: scrubber, slabs,
 * summary, finished.
 */
static void do_drain_step(struct vdo_completion *completion)
{
	struct block_allocator *allocator = vdo_as_block_allocator(completion);

	/* Re-arm the completion so the next step's callback lands back here. */
	vdo_prepare_completion_for_requeue(&allocator->completion, do_drain_step,
					   handle_operation_error, allocator->thread_id,
					   NULL);

	/* Pre-increment: advance to the next step before dispatching it. */
	switch (++allocator->drain_step) {
	case VDO_DRAIN_ALLOCATOR_STEP_SCRUBBER:
		stop_scrubbing(allocator);
		return;

	case VDO_DRAIN_ALLOCATOR_STEP_SLABS:
		apply_to_slabs(allocator, do_drain_step);
		return;

	case VDO_DRAIN_ALLOCATOR_STEP_SUMMARY:
		vdo_start_draining(&allocator->summary_state,
				   vdo_get_admin_state_code(&allocator->state),
				   completion, initiate_summary_drain);
		return;

	case VDO_DRAIN_ALLOCATOR_STEP_FINISHED:
		/* All steps done; no vios should remain outstanding. */
		VDO_ASSERT_LOG_ONLY(!is_vio_pool_busy(allocator->vio_pool),
				    "vio pool not busy");
		vdo_finish_draining_with_result(&allocator->state, completion->result);
		return;

	default:
		/* An unknown step value indicates corrupted drain state. */
		vdo_finish_draining_with_result(&allocator->state, UDS_BAD_STATE);
	}
}
/* Admin-state initiator: reset the step counter and start the drain cycle. */
static void initiate_drain(struct admin_state *state)
{
	struct block_allocator *allocator =
		container_of(state, struct block_allocator, state);

	allocator->drain_step = VDO_DRAIN_ALLOCATOR_START;
	do_drain_step(&allocator->completion);
}
/* Per-zone action: begin draining one zone's block allocator. */
static void drain_allocator(void *context, zone_count_t zone_number,
			    struct vdo_completion *parent)
{
	struct slab_depot *depot = context;
	const struct admin_state_code *operation =
		vdo_get_current_manager_operation(depot->action_manager);

	vdo_start_draining(&depot->allocators[zone_number].state, operation,
			   parent, initiate_drain);
}
/*
 * Drain every zone's allocator with the given drain operation, notifying the
 * parent completion when the whole depot is drained.
 */
void vdo_drain_slab_depot(struct slab_depot *depot,
			  const struct admin_state_code *operation,
			  struct vdo_completion *parent)
{
	vdo_schedule_operation(depot->action_manager, operation,
			       NULL, drain_allocator, NULL, parent);
}
/*
 * Resume step: restart the allocator's scrubber if it has work queued,
 * finishing or failing the allocator's completion accordingly.
 */
static void resume_scrubbing(struct block_allocator *allocator)
{
	struct slab_scrubber *scrubber = &allocator->scrubber;
	int result;

	if (!has_slabs_to_scrub(scrubber)) {
		/* Nothing to scrub; the resume step is trivially complete. */
		vdo_finish_completion(&allocator->completion);
		return;
	}

	result = vdo_resume_if_quiescent(&scrubber->admin_state);
	if (result == VDO_SUCCESS) {
		scrub_next_slab(scrubber);
		vdo_finish_completion(&allocator->completion);
	} else {
		vdo_fail_completion(&allocator->completion, result);
	}
}
/*
 * Perform the next step of resuming an allocator, undoing the drain steps
 * in reverse order: summary, slabs, scrubber, start. As with draining, each
 * asynchronous step re-enters this function on the allocator's thread.
 */
static void do_resume_step(struct vdo_completion *completion)
{
	struct block_allocator *allocator = vdo_as_block_allocator(completion);

	/* Re-arm the completion so the next step's callback lands back here. */
	vdo_prepare_completion_for_requeue(&allocator->completion, do_resume_step,
					   handle_operation_error,
					   allocator->thread_id, NULL);

	/* Pre-decrement: walk the drain steps backwards. */
	switch (--allocator->drain_step) {
	case VDO_DRAIN_ALLOCATOR_STEP_SUMMARY:
		vdo_fail_completion(completion,
				    vdo_resume_if_quiescent(&allocator->summary_state));
		return;

	case VDO_DRAIN_ALLOCATOR_STEP_SLABS:
		apply_to_slabs(allocator, do_resume_step);
		return;

	case VDO_DRAIN_ALLOCATOR_STEP_SCRUBBER:
		resume_scrubbing(allocator);
		return;

	case VDO_DRAIN_ALLOCATOR_START:
		/* Back at the starting step: the resume is complete. */
		vdo_finish_resuming_with_result(&allocator->state, completion->result);
		return;

	default:
		/* An unknown step value indicates corrupted state. */
		vdo_finish_resuming_with_result(&allocator->state, UDS_BAD_STATE);
	}
}
/*
 * Admin-state initiator: start from the final drain step so do_resume_step()
 * can walk the steps back down to VDO_DRAIN_ALLOCATOR_START.
 */
static void initiate_resume(struct admin_state *state)
{
	struct block_allocator *allocator =
		container_of(state, struct block_allocator, state);

	allocator->drain_step = VDO_DRAIN_ALLOCATOR_STEP_FINISHED;
	do_resume_step(&allocator->completion);
}
/* Per-zone action: begin resuming one zone's block allocator. */
static void resume_allocator(void *context, zone_count_t zone_number,
			     struct vdo_completion *parent)
{
	struct slab_depot *depot = context;
	const struct admin_state_code *operation =
		vdo_get_current_manager_operation(depot->action_manager);

	vdo_start_resuming(&depot->allocators[zone_number].state, operation,
			   parent, initiate_resume);
}
/*
 * Resume a suspended slab depot by resuming each zone's allocator. Refuses
 * with VDO_READ_ONLY when the vdo has entered read-only mode.
 */
void vdo_resume_slab_depot(struct slab_depot *depot, struct vdo_completion *parent)
{
	if (vdo_is_read_only(depot->vdo)) {
		vdo_continue_completion(parent, VDO_READ_ONLY);
		return;
	}

	vdo_schedule_operation(depot->action_manager, VDO_ADMIN_STATE_RESUMING,
			       NULL, resume_allocator, NULL, parent);
}
/*
 * Record the oldest recovery journal block which must be retained, and
 * schedule the default action so slab journals can release their locks on
 * older blocks. A NULL depot is tolerated and ignored.
 */
void vdo_commit_oldest_slab_journal_tail_blocks(struct slab_depot *depot,
						sequence_number_t recovery_block_number)
{
	if (depot != NULL) {
		depot->new_release_request = recovery_block_number;
		vdo_schedule_default_action(depot->action_manager);
	}
}
/* Per-zone action: kick off scrubbing in one zone without a parent waiter. */
static void scrub_all_unrecovered_slabs(void *context, zone_count_t zone_number,
					struct vdo_completion *parent)
{
	struct slab_depot *depot = context;
	struct block_allocator *allocator = &depot->allocators[zone_number];

	scrub_slabs(allocator, NULL);
	vdo_launch_completion(parent);
}
/*
 * Schedule scrubbing of all unrecovered slabs across every zone. The parent
 * is notified when the scrubbing has been launched, not when it completes.
 */
void vdo_scrub_all_unrecovered_slabs(struct slab_depot *depot,
				     struct vdo_completion *parent)
{
	vdo_schedule_action(depot->action_manager, NULL,
			    scrub_all_unrecovered_slabs,
			    NULL, parent);
}
/* Sum the block allocator statistics from every zone into one struct. */
static struct block_allocator_statistics __must_check
get_block_allocator_statistics(const struct slab_depot *depot)
{
	struct block_allocator_statistics sum;
	zone_count_t zone;

	memset(&sum, 0, sizeof(sum));

	for (zone = 0; zone < depot->zone_count; zone++) {
		const struct block_allocator *allocator = &depot->allocators[zone];
		const struct block_allocator_statistics *source =
			&allocator->statistics;

		sum.slab_count += allocator->slab_count;
		/* READ_ONCE: the counters are updated on other threads. */
		sum.slabs_opened += READ_ONCE(source->slabs_opened);
		sum.slabs_reopened += READ_ONCE(source->slabs_reopened);
	}

	return sum;
}
/* Sum the reference count statistics from every zone into one struct. */
static struct ref_counts_statistics __must_check
get_ref_counts_statistics(const struct slab_depot *depot)
{
	struct ref_counts_statistics sum;
	zone_count_t zone;

	memset(&sum, 0, sizeof(sum));

	for (zone = 0; zone < depot->zone_count; zone++) {
		const struct block_allocator *allocator = &depot->allocators[zone];

		/* READ_ONCE: the counter is updated on other threads. */
		sum.blocks_written +=
			READ_ONCE(allocator->ref_counts_statistics.blocks_written);
	}

	return sum;
}
/* Sum the slab journal statistics from every zone into one struct. */
static struct slab_journal_statistics __must_check
get_slab_journal_statistics(const struct slab_depot *depot)
{
	struct slab_journal_statistics sum;
	zone_count_t zone;

	memset(&sum, 0, sizeof(sum));

	for (zone = 0; zone < depot->zone_count; zone++) {
		const struct slab_journal_statistics *source =
			&depot->allocators[zone].slab_journal_statistics;

		/* READ_ONCE: the counters are updated on other threads. */
		sum.disk_full_count += READ_ONCE(source->disk_full_count);
		sum.flush_count += READ_ONCE(source->flush_count);
		sum.blocked_count += READ_ONCE(source->blocked_count);
		sum.blocks_written += READ_ONCE(source->blocks_written);
		sum.tail_busy_count += READ_ONCE(source->tail_busy_count);
	}

	return sum;
}
/*
 * Populate the depot-related fields of a vdo_statistics structure with the
 * current (racy but self-consistent-enough) depot counters.
 *
 * Fix: the original computed recovery_percentage with an unconditional
 * division by slab_count, which divides by zero if the depot has no slabs.
 * A depot with zero slabs is now reported as fully recovered.
 */
void vdo_get_slab_depot_statistics(const struct slab_depot *depot,
				   struct vdo_statistics *stats)
{
	slab_count_t slab_count = READ_ONCE(depot->slab_count);
	slab_count_t unrecovered = 0;
	zone_count_t zone;

	for (zone = 0; zone < depot->zone_count; zone++) {
		/* READ_ONCE: the scrubbers update their counts on other threads. */
		unrecovered += READ_ONCE(depot->allocators[zone].scrubber.slab_count);
	}

	/* Guard against division by zero when there are no slabs. */
	stats->recovery_percentage = (slab_count == 0) ?
		100 : ((slab_count - unrecovered) * 100 / slab_count);
	stats->allocator = get_block_allocator_statistics(depot);
	stats->ref_counts = get_ref_counts_statistics(depot);
	stats->slab_journal = get_slab_journal_statistics(depot);
	stats->slab_summary = (struct slab_summary_statistics) {
		.blocks_written = atomic64_read(&depot->summary_statistics.blocks_written),
	};
}
/* Log a one-line summary of the depot's state for debugging. */
void vdo_dump_slab_depot(const struct slab_depot *depot)
{
	vdo_log_info("vdo slab depot");
	vdo_log_info("  zone_count=%u old_zone_count=%u slabCount=%u active_release_request=%llu new_release_request=%llu",
		     (unsigned int) depot->zone_count,
		     (unsigned int) depot->old_zone_count, READ_ONCE(depot->slab_count),
		     (unsigned long long) depot->active_release_request,
		     (unsigned long long) depot->new_release_request);
}