root/drivers/md/dm-vdo/physical-zone.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2023 Red Hat
 */

#include "physical-zone.h"

#include <linux/list.h>

#include "logger.h"
#include "memory-alloc.h"
#include "permassert.h"

#include "block-map.h"
#include "completion.h"
#include "constants.h"
#include "data-vio.h"
#include "dedupe.h"
#include "encodings.h"
#include "flush.h"
#include "int-map.h"
#include "slab-depot.h"
#include "status-codes.h"
#include "vdo.h"

/*
 * Each user data_vio needs a PBN read lock and write lock, hence the factor of two applied to
 * MAXIMUM_VDO_USER_VIOS. This is the capacity given to each physical zone's lock pool.
 */
#define LOCK_POOL_CAPACITY (2 * MAXIMUM_VDO_USER_VIOS)

/* The per-type properties of a PBN lock; LOCK_IMPLEMENTATIONS holds one of these per lock type. */
struct pbn_lock_implementation {
        /* The lock type this entry describes. */
        enum pbn_lock_type type;
        /* A short name for the lock type. */
        const char *name;
        /* Describes the locked block in the error logged if releasing its reference fails. */
        const char *release_reason;
};

/*
 * This array must have an entry for every pbn_lock_type value.
 *
 * Entries are indexed by pbn_lock_type. A lock's type is recorded and tested by the address of
 * its entry in this table (see set_pbn_lock_type() and has_lock_type()).
 */
static const struct pbn_lock_implementation LOCK_IMPLEMENTATIONS[] = {
        [VIO_READ_LOCK] = {
                .type = VIO_READ_LOCK,
                .name = "read",
                .release_reason = "candidate duplicate",
        },
        [VIO_WRITE_LOCK] = {
                .type = VIO_WRITE_LOCK,
                .name = "write",
                .release_reason = "newly allocated",
        },
        [VIO_BLOCK_MAP_WRITE_LOCK] = {
                .type = VIO_BLOCK_MAP_WRITE_LOCK,
                .name = "block map write",
                .release_reason = "block map write",
        },
};

/* Report whether a lock's implementation is the LOCK_IMPLEMENTATIONS entry for the given type. */
static inline bool has_lock_type(const struct pbn_lock *lock, enum pbn_lock_type type)
{
        const struct pbn_lock_implementation *expected = &LOCK_IMPLEMENTATIONS[type];

        return lock->implementation == expected;
}

/**
 * vdo_is_pbn_read_lock() - Check whether a pbn_lock is a read lock.
 * @lock: The lock to check.
 *
 * Return: True if the lock is a read lock.
 */
bool vdo_is_pbn_read_lock(const struct pbn_lock *lock)
{
        return has_lock_type(lock, VIO_READ_LOCK);
}

/* Point a lock at the LOCK_IMPLEMENTATIONS entry for the given type. */
static inline void set_pbn_lock_type(struct pbn_lock *lock, enum pbn_lock_type type)
{
        const struct pbn_lock_implementation *implementation = &LOCK_IMPLEMENTATIONS[type];

        lock->implementation = implementation;
}

/**
 * vdo_downgrade_pbn_write_lock() - Convert a PBN write lock into a PBN read lock in place.
 * @lock: The PBN write lock to downgrade.
 * @compressed_write: True if the written block was a compressed block.
 *
 * The lock must be a data write lock (not a read lock and not a block map write lock) with
 * exactly one holder; violations are logged but do not stop the downgrade. The lock's increment
 * limit is set and its type becomes VIO_READ_LOCK.
 */
void vdo_downgrade_pbn_write_lock(struct pbn_lock *lock, bool compressed_write)
{
        VDO_ASSERT_LOG_ONLY(!vdo_is_pbn_read_lock(lock),
                            "PBN lock must not already have been downgraded");
        VDO_ASSERT_LOG_ONLY(!has_lock_type(lock, VIO_BLOCK_MAP_WRITE_LOCK),
                            "must not downgrade block map write locks");
        VDO_ASSERT_LOG_ONLY(lock->holder_count == 1,
                            "PBN write lock should have one holder but has %u",
                            lock->holder_count);

        /*
         * The writer keeps its hold on the lock across the downgrade. After a compressed write
         * the holder has not yet journaled its own reference increment, so the full range of
         * increments remains available; otherwise one has already been used.
         */
        if (compressed_write)
                lock->increment_limit = MAXIMUM_REFERENCE_COUNT;
        else
                lock->increment_limit = MAXIMUM_REFERENCE_COUNT - 1;

        set_pbn_lock_type(lock, VIO_READ_LOCK);
}

/**
 * vdo_claim_pbn_lock_increment() - Try to claim one of the available reference count increments on
 *                                  a read lock.
 * @lock: The PBN read lock from which to claim an increment.
 *
 * Claims may be attempted from any thread. A claim is only valid until the PBN lock is released.
 *
 * Return: true if the claim succeeded, guaranteeing one increment can be made without overflowing
 *         the PBN's reference count.
 */
bool vdo_claim_pbn_lock_increment(struct pbn_lock *lock)
{
        /*
         * Claim the next free reference atomically since hash locks from multiple hash zone
         * threads might be concurrently deduplicating against a single PBN lock on compressed
         * block. As long as hitting the increment limit will lead to the PBN lock being released
         * in a sane time-frame, we won't overflow a 32-bit claim counter, allowing a simple add
         * instead of a compare-and-swap.
         */
        u32 claim_number = (u32) atomic_add_return(1, &lock->increments_claimed);

        return (claim_number <= lock->increment_limit);
}

/**
 * vdo_assign_pbn_lock_provisional_reference() - Record that a PBN lock is now responsible for a
 *                                               provisional reference.
 * @lock: The PBN lock.
 *
 * The lock is expected not to hold a provisional reference already; if it does, the violation is
 * logged and the flag is set regardless.
 */
void vdo_assign_pbn_lock_provisional_reference(struct pbn_lock *lock)
{
        bool already_assigned = lock->has_provisional_reference;

        VDO_ASSERT_LOG_ONLY(!already_assigned,
                            "lock does not have a provisional reference");
        lock->has_provisional_reference = true;
}

/**
 * vdo_unassign_pbn_lock_provisional_reference() - Inform a PBN lock that it is no longer
 *                                                 responsible for a provisional reference.
 * @lock: The PBN lock.
 *
 * This only clears the lock's has_provisional_reference flag; it does not itself release any
 * block reference.
 */
void vdo_unassign_pbn_lock_provisional_reference(struct pbn_lock *lock)
{
        lock->has_provisional_reference = false;
}

/**
 * release_pbn_lock_provisional_reference() - Release the provisional reference a lock is
 *                                            responsible for, if it has one.
 * @lock: The lock.
 * @locked_pbn: The PBN covered by the lock.
 * @allocator: The block allocator from which to release the reference.
 *
 * Called when the lock itself is being released. A failure to release the reference is logged
 * and otherwise ignored; either way the lock stops being responsible for the reference.
 */
static void release_pbn_lock_provisional_reference(struct pbn_lock *lock,
                                                   physical_block_number_t locked_pbn,
                                                   struct block_allocator *allocator)
{
        if (vdo_pbn_lock_has_provisional_reference(lock)) {
                int result = vdo_release_block_reference(allocator, locked_pbn);

                if (result != VDO_SUCCESS) {
                        vdo_log_error_strerror(result,
                                               "Failed to release reference to %s physical block %llu",
                                               lock->implementation->release_reason,
                                               (unsigned long long) locked_pbn);
                }

                /* The lock no longer answers for the reference, whether or not that worked. */
                lock->has_provisional_reference = false;
        }
}

/**
 * union idle_pbn_lock - PBN lock list entries.
 *
 * Unused (idle) PBN locks are kept in a list. Just like in a malloc implementation, the lock
 * structure is unused memory, so we can save a bit of space (and not pollute the lock structure
 * proper) by using a union to overlay the lock structure with the free list.
 *
 * Because the two members share storage, the list linkage is only meaningful while a lock sits
 * on a pool's idle list, and the lock contents are only meaningful while the lock is on loan.
 */
typedef union {
        /** @entry: Only used while locks are in the pool. */
        struct list_head entry;
        /** @lock: Only used while locks are not in the pool. */
        struct pbn_lock lock;
} idle_pbn_lock;

/**
 * struct pbn_lock_pool - list of PBN locks.
 *
 * The lock pool is little more than the memory allocated for the locks. The locks live in the
 * flexible array at the end of the structure, so the pool and its locks form a single
 * allocation.
 */
struct pbn_lock_pool {
        /** @capacity: The number of locks allocated for the pool. */
        size_t capacity;
        /**
         * @borrowed: The number of locks currently borrowed from the pool. Borrowing is refused
         * once this reaches @capacity.
         */
        size_t borrowed;
        /** @idle_list: A list containing all idle PBN lock instances. */
        struct list_head idle_list;
        /** @locks: The memory for all the locks allocated by this pool. */
        idle_pbn_lock locks[];
};

/**
 * return_pbn_lock_to_pool() - Give a PBN lock back to the pool it was borrowed from.
 * @pool: The pool from which the lock was borrowed.
 * @lock: The last reference to the lock being returned.
 *
 * The caller must hold the last live reference to the lock, exactly as if the memory were being
 * freed: the lock's memory is zeroed and then reused as an idle list entry.
 */
static void return_pbn_lock_to_pool(struct pbn_lock_pool *pool, struct pbn_lock *lock)
{
        idle_pbn_lock *slot = container_of(lock, idle_pbn_lock, lock);

        /* Zeroing costs a little, but makes some use-after-free errors show up promptly. */
        memset(lock, 0, sizeof(*lock));

        /* The zeroed lock memory now serves as the linkage on the idle list. */
        INIT_LIST_HEAD(&slot->entry);
        list_add_tail(&slot->entry, &pool->idle_list);

        VDO_ASSERT_LOG_ONLY(pool->borrowed > 0, "shouldn't return more than borrowed");
        pool->borrowed--;
}

/**
 * make_pbn_lock_pool() - Create a new PBN lock pool and all the lock instances it can loan out.
 *
 * @capacity: The number of PBN locks to allocate for the pool.
 * @pool_ptr: A pointer to receive the new pool.
 *
 * Return: VDO_SUCCESS or an error code.
 */
static int make_pbn_lock_pool(size_t capacity, struct pbn_lock_pool **pool_ptr)
{
        size_t i;
        struct pbn_lock_pool *pool;
        int result;

        result = vdo_allocate_extended(struct pbn_lock_pool, capacity, idle_pbn_lock,
                                       __func__, &pool);
        if (result != VDO_SUCCESS)
                return result;

        pool->capacity = capacity;
        pool->borrowed = capacity;
        INIT_LIST_HEAD(&pool->idle_list);

        for (i = 0; i < capacity; i++)
                return_pbn_lock_to_pool(pool, &pool->locks[i].lock);

        *pool_ptr = pool;
        return VDO_SUCCESS;
}

/**
 * free_pbn_lock_pool() - Release a PBN lock pool.
 * @pool: The lock pool to free; may be NULL, in which case nothing happens.
 *
 * The memory for every PBN lock belongs to the pool and is freed with it, so the caller must
 * ensure that all locks have been returned first. Outstanding loans are logged, but the pool is
 * freed regardless.
 */
static void free_pbn_lock_pool(struct pbn_lock_pool *pool)
{
        if (!pool)
                return;

        VDO_ASSERT_LOG_ONLY(pool->borrowed == 0,
                            "All PBN locks must be returned to the pool before it is freed, but %zu locks are still on loan",
                            pool->borrowed);

        vdo_free(pool);
}

/**
 * borrow_pbn_lock_from_pool() - Borrow a PBN lock from the pool and initialize it with the
 *                               provided type.
 * @pool: The pool from which to borrow.
 * @type: The type with which to initialize the lock.
 * @lock_ptr:  A pointer to receive the borrowed lock.
 *
 * Pools do not grow on demand or allocate memory, so this will fail if the pool is empty. Borrowed
 * locks are still associated with this pool and must be returned to only this pool.
 *
 * The borrowed lock is handed out with a holder count of zero. On any failure, @lock_ptr is not
 * written and the pool's borrowed count is left unchanged.
 *
 * Return: VDO_SUCCESS, VDO_LOCK_ERROR if the pool is empty, or the assertion error if the idle
 *         list is unexpectedly empty while the pool is not at capacity.
 */
static int __must_check borrow_pbn_lock_from_pool(struct pbn_lock_pool *pool,
                                                  enum pbn_lock_type type,
                                                  struct pbn_lock **lock_ptr)
{
        int result;
        idle_pbn_lock *idle;

        if (pool->borrowed >= pool->capacity)
                return vdo_log_error_strerror(VDO_LOCK_ERROR,
                                              "no free PBN locks left to borrow");

        result = VDO_ASSERT(!list_empty(&pool->idle_list),
                            "idle list should not be empty if pool not at capacity");
        if (result != VDO_SUCCESS)
                return result;

        /*
         * Count the loan only once a lock is certain to be handed out, so that a failed
         * consistency check above cannot leave the pool claiming a loan that was never made.
         */
        pool->borrowed += 1;

        /* Take the entry at the tail of the idle list. */
        idle = list_last_entry(&pool->idle_list, idle_pbn_lock, entry);
        list_del(&idle->entry);

        /* The list linkage shares storage with the lock, so wipe it before the lock is used. */
        memset(&idle->entry, 0, sizeof(idle->entry));

        idle->lock.holder_count = 0;
        set_pbn_lock_type(&idle->lock, type);

        *lock_ptr = &idle->lock;
        return VDO_SUCCESS;
}

/**
 * initialize_zone() - Initialize a physical zone.
 * @vdo: The vdo to which the zone will belong.
 * @zones: The physical_zones to which the zone being initialized belongs
 *
 * The zone initialized is the one at index zones->zone_count; this function does not change that
 * count. On failure, everything this function allocated for the zone is freed again and the
 * zone's pointers to those resources are cleared.
 *
 * Return: VDO_SUCCESS or an error code.
 */
static int initialize_zone(struct vdo *vdo, struct physical_zones *zones)
{
        int result;
        zone_count_t zone_number = zones->zone_count;
        struct physical_zone *zone = &zones->zones[zone_number];

        result = vdo_int_map_create(VDO_LOCK_MAP_CAPACITY, &zone->pbn_operations);
        if (result != VDO_SUCCESS)
                return result;

        result = make_pbn_lock_pool(LOCK_POOL_CAPACITY, &zone->lock_pool);
        if (result != VDO_SUCCESS)
                goto free_map;

        zone->zone_number = zone_number;
        zone->thread_id = vdo->thread_config.physical_threads[zone_number];
        zone->allocator = &vdo->depot->allocators[zone_number];
        /* The zones form a ring: the last zone's successor is the first zone. */
        zone->next = &zones->zones[(zone_number + 1) % vdo->thread_config.physical_zone_count];

        result = vdo_make_default_thread(vdo, zone->thread_id);
        if (result != VDO_SUCCESS)
                goto free_pool;

        return VDO_SUCCESS;

free_pool:
        free_pbn_lock_pool(vdo_forget(zone->lock_pool));
free_map:
        /* Clear the pointer as well, so the zone never holds a reference to freed memory. */
        vdo_int_map_free(vdo_forget(zone->pbn_operations));
        return result;
}

/**
 * vdo_make_physical_zones() - Construct the physical zones for a vdo.
 * @vdo: The vdo being constructed
 * @zones_ptr: A pointer to hold the zones
 *
 * If the vdo is configured with no physical zones, nothing is allocated and @zones_ptr is left
 * untouched. If any zone fails to initialize, the zones built so far are freed and @zones_ptr is
 * not written.
 *
 * Return: VDO_SUCCESS or an error code.
 */
int vdo_make_physical_zones(struct vdo *vdo, struct physical_zones **zones_ptr)
{
        zone_count_t wanted = vdo->thread_config.physical_zone_count;
        struct physical_zones *zones;
        int result;

        if (wanted == 0)
                return VDO_SUCCESS;

        result = vdo_allocate_extended(struct physical_zones, wanted,
                                       struct physical_zone, __func__, &zones);
        if (result != VDO_SUCCESS)
                return result;

        /*
         * zones->zone_count doubles as the index of the zone being initialized and as the number
         * of fully initialized zones, which is what cleanup relies on after a failure.
         */
        zones->zone_count = 0;
        while (zones->zone_count < wanted) {
                result = initialize_zone(vdo, zones);
                if (result != VDO_SUCCESS) {
                        vdo_free_physical_zones(zones);
                        return result;
                }

                zones->zone_count++;
        }

        *zones_ptr = zones;
        return VDO_SUCCESS;
}

/**
 * vdo_free_physical_zones() - Destroy the physical zones.
 * @zones: The zones to free.
 */
void vdo_free_physical_zones(struct physical_zones *zones)
{
        zone_count_t index;

        if (zones == NULL)
                return;

        for (index = 0; index < zones->zone_count; index++) {
                struct physical_zone *zone = &zones->zones[index];

                free_pbn_lock_pool(vdo_forget(zone->lock_pool));
                vdo_int_map_free(vdo_forget(zone->pbn_operations));
        }

        vdo_free(zones);
}

/**
 * vdo_get_physical_zone_pbn_lock() - Look up the lock on a PBN, if there is one.
 * @zone: The physical zone responsible for the PBN; may be NULL.
 * @pbn: The physical block number whose lock is desired.
 *
 * Return: The lock, or NULL if the PBN is not locked or @zone is NULL.
 */
struct pbn_lock *vdo_get_physical_zone_pbn_lock(struct physical_zone *zone,
                                                physical_block_number_t pbn)
{
        if (zone == NULL)
                return NULL;

        return vdo_int_map_get(zone->pbn_operations, pbn);
}

/**
 * vdo_attempt_physical_zone_pbn_lock() - Try to lock a physical block in the zone responsible
 *                                        for it.
 * @zone: The physical zone responsible for the PBN.
 * @pbn: The physical block number to lock.
 * @type: The type with which to initialize a new lock.
 * @lock_ptr:  A pointer to receive the lock, existing or new.
 *
 * If the PBN is already locked, the existing lock is returned and @type is not applied to it.
 * Otherwise a lock borrowed from the zone's pool, initialized with @type, is recorded for the PBN
 * and returned. A newly created lock has a holder count of zero, while an existing lock is
 * expected to have a positive holder count, so callers can tell the two cases apart.
 *
 * Return: VDO_SUCCESS or an error.
 */
int vdo_attempt_physical_zone_pbn_lock(struct physical_zone *zone,
                                       physical_block_number_t pbn,
                                       enum pbn_lock_type type,
                                       struct pbn_lock **lock_ptr)
{
        struct pbn_lock *existing_lock;
        struct pbn_lock *fresh_lock = NULL;
        int result;

        /*
         * Prepare a lock from the pool up front: when there is no contention, which is the
         * common case, a single int_map access then suffices instead of two.
         */
        result = borrow_pbn_lock_from_pool(zone->lock_pool, type, &fresh_lock);
        if (result != VDO_SUCCESS) {
                VDO_ASSERT_LOG_ONLY(false, "must always be able to borrow a PBN lock");
                return result;
        }

        result = vdo_int_map_put(zone->pbn_operations, pbn, fresh_lock, false,
                                 (void **) &existing_lock);
        if (result != VDO_SUCCESS) {
                return_pbn_lock_to_pool(zone->lock_pool, fresh_lock);
                return result;
        }

        if (existing_lock == NULL) {
                /* Nothing was mapped for this PBN before, so the borrowed lock is now the lock. */
                *lock_ptr = fresh_lock;
                return VDO_SUCCESS;
        }

        /* Someone already holds a lock on this PBN, so the borrowed one is surplus. */
        return_pbn_lock_to_pool(zone->lock_pool, vdo_forget(fresh_lock));
        result = VDO_ASSERT(existing_lock->holder_count > 0, "physical block %llu lock held",
                            (unsigned long long) pbn);
        if (result != VDO_SUCCESS)
                return result;

        *lock_ptr = existing_lock;
        return VDO_SUCCESS;
}

/**
 * allocate_and_lock_block() - Attempt to allocate a block from this zone.
 * @allocation: The struct allocation of the data_vio attempting to allocate.
 *
 * If a block is allocated, the recipient will also hold a lock on it.
 *
 * Return: VDO_SUCCESS if a block was allocated, or an error code.
 */
static int allocate_and_lock_block(struct allocation *allocation)
{
        int result;
        struct pbn_lock *lock;

        VDO_ASSERT_LOG_ONLY(allocation->lock == NULL,
                            "must not allocate a block while already holding a lock on one");

        result = vdo_allocate_block(allocation->zone->allocator, &allocation->pbn);
        if (result != VDO_SUCCESS)
                return result;

        result = vdo_attempt_physical_zone_pbn_lock(allocation->zone, allocation->pbn,
                                                    allocation->write_lock_type, &lock);
        if (result != VDO_SUCCESS)
                return result;

        if (lock->holder_count > 0) {
                /* This block is already locked, which should be impossible. */
                return vdo_log_error_strerror(VDO_LOCK_ERROR,
                                              "Newly allocated block %llu was spuriously locked (holder_count=%u)",
                                              (unsigned long long) allocation->pbn,
                                              lock->holder_count);
        }

        /* We've successfully acquired a new lock, so mark it as ours. */
        lock->holder_count += 1;
        allocation->lock = lock;
        vdo_assign_pbn_lock_provisional_reference(lock);
        return VDO_SUCCESS;
}

/**
 * retry_allocation() - Restart a block allocation after waiting for scrubbing.
 * @waiter: The waiter of the data_vio that was waiting to allocate.
 * @context: The context (unused).
 *
 * The allocation stops waiting for a clean slab and starts a fresh pass over the zones, beginning
 * at its current zone, before the data_vio is continued.
 */
static void retry_allocation(struct vdo_waiter *waiter, void __always_unused *context)
{
        struct data_vio *data_vio = vdo_waiter_as_data_vio(waiter);
        struct allocation *allocation = &data_vio->allocation;

        /* A slab has been scrubbed, so the search starts over from the current zone. */
        allocation->wait_for_clean_slab = false;
        allocation->first_allocation_zone = allocation->zone->zone_number;

        continue_data_vio(data_vio);
}

/**
 * continue_allocating() - Keep looking for an allocation, either by queueing to wait for scrubbing
 *                         or by moving on to the next zone.
 * @data_vio: The data_vio attempting to get an allocation.
 *
 * This method should only be called from the error handler set in data_vio_allocate_data_block.
 *
 * Return: true if the data_vio has been queued to wait for a clean slab or launched in the next
 *         zone; false if the search is over, in which case the completion's result has been set.
 */
static bool continue_allocating(struct data_vio *data_vio)
{
        struct vdo_completion *completion = &data_vio->vio.completion;
        struct allocation *allocation = &data_vio->allocation;
        struct physical_zone *current_zone = allocation->zone;
        bool already_waiting = allocation->wait_for_clean_slab;
        bool wrapped = (current_zone->next->zone_number == allocation->first_allocation_zone);
        int result = VDO_SUCCESS;

        vdo_reset_completion(completion);

        if (wrapped && !already_waiting) {
                /*
                 * Every zone has been tried without success. Make a second pass, this time
                 * waiting in each zone for a slab to be scrubbed before trying to allocate.
                 */
                allocation->wait_for_clean_slab = true;
                allocation->first_allocation_zone = current_zone->zone_number;
        }

        if (allocation->wait_for_clean_slab) {
                data_vio->waiter.callback = retry_allocation;
                result = vdo_enqueue_clean_slab_waiter(current_zone->allocator,
                                                       &data_vio->waiter);

                /* Queued: the data_vio now waits for a slab to be scrubbed. */
                if (result == VDO_SUCCESS)
                        return true;

                /* Stop on a real error, or once the waiting pass has covered every zone. */
                if ((result != VDO_NO_SPACE) || (already_waiting && wrapped)) {
                        vdo_set_completion_result(completion, result);
                        return false;
                }
        }

        /* Hand the data_vio over to the next zone, on that zone's thread. */
        allocation->zone = current_zone->next;
        completion->callback_thread_id = allocation->zone->thread_id;
        vdo_launch_completion(completion);
        return true;
}

/**
 * vdo_allocate_block_in_zone() - Try to allocate a block in the current physical zone, moving on
 *                                to the next zone if that fails and another attempt is possible.
 * @data_vio: The data_vio needing an allocation.
 *
 * Return: True if a block was allocated, if not the data_vio will have been dispatched so the
 *         caller must not touch it.
 */
bool vdo_allocate_block_in_zone(struct data_vio *data_vio)
{
        int result = allocate_and_lock_block(&data_vio->allocation);

        if (result == VDO_SUCCESS)
                return true;

        /* Out of space here, but the search has been carried on elsewhere. */
        if ((result == VDO_NO_SPACE) && continue_allocating(data_vio))
                return false;

        /* Either a real error occurred or there is nowhere left to look. */
        continue_data_vio_with_error(data_vio, result);
        return false;
}

/**
 * vdo_release_physical_zone_pbn_lock() - Drop one hold on a physical block lock, returning the
 *                                        lock to the pool once nobody holds it.
 * @zone: The physical zone in which the lock was obtained.
 * @locked_pbn: The physical block number to unlock.
 * @lock: The lock being released; may be NULL, in which case nothing happens.
 *
 * The caller's reference to the lock must not be used afterwards, as if the memory were being
 * freed: when the last hold is dropped, the lock memory will be re-initialized or zeroed.
 */
void vdo_release_physical_zone_pbn_lock(struct physical_zone *zone,
                                        physical_block_number_t locked_pbn,
                                        struct pbn_lock *lock)
{
        struct pbn_lock *mapped_lock;

        if (lock == NULL)
                return;

        VDO_ASSERT_LOG_ONLY(lock->holder_count > 0,
                            "should not be releasing a lock that is not held");

        lock->holder_count--;

        /* Other holders still share the lock, so it has to stay in place. */
        if (lock->holder_count > 0)
                return;

        mapped_lock = vdo_int_map_remove(zone->pbn_operations, locked_pbn);
        VDO_ASSERT_LOG_ONLY((lock == mapped_lock), "physical block lock mismatch for block %llu",
                            (unsigned long long) locked_pbn);

        release_pbn_lock_provisional_reference(lock, locked_pbn, zone->allocator);
        return_pbn_lock_to_pool(zone->lock_pool, lock);
}

/**
 * vdo_dump_physical_zone() - Dump information about a physical zone to the log for debugging.
 * @zone: The zone to dump.
 *
 * Currently this only dumps the zone's block allocator.
 */
void vdo_dump_physical_zone(const struct physical_zone *zone)
{
        vdo_dump_block_allocator(zone->allocator);
}