root/drivers/accel/rocket/rocket_job.c
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright 2019 Linaro, Ltd, Rob Herring <robh@kernel.org> */
/* Copyright 2019 Collabora ltd. */
/* Copyright 2024-2025 Tomeu Vizoso <tomeu@tomeuvizoso.net> */

#include <drm/drm_print.h>
#include <drm/drm_file.h>
#include <drm/drm_gem.h>
#include <drm/rocket_accel.h>
#include <linux/interrupt.h>
#include <linux/iommu.h>
#include <linux/platform_device.h>
#include <linux/pm_runtime.h>

#include "rocket_core.h"
#include "rocket_device.h"
#include "rocket_drv.h"
#include "rocket_job.h"
#include "rocket_registers.h"

#define JOB_TIMEOUT_MS 500

static struct rocket_job *
to_rocket_job(struct drm_sched_job *sched_job)
{
        return container_of(sched_job, struct rocket_job, base);
}

static const char *rocket_fence_get_driver_name(struct dma_fence *fence)
{
        return "rocket";
}

static const char *rocket_fence_get_timeline_name(struct dma_fence *fence)
{
        return "rockchip-npu";
}

static const struct dma_fence_ops rocket_fence_ops = {
        .get_driver_name = rocket_fence_get_driver_name,
        .get_timeline_name = rocket_fence_get_timeline_name,
};

static struct dma_fence *rocket_fence_create(struct rocket_core *core)
{
        struct dma_fence *fence;

        fence = kzalloc_obj(*fence);
        if (!fence)
                return ERR_PTR(-ENOMEM);

        dma_fence_init(fence, &rocket_fence_ops, &core->fence_lock,
                       core->fence_context, ++core->emit_seqno);

        return fence;
}

static int
rocket_copy_tasks(struct drm_device *dev,
                  struct drm_file *file_priv,
                  struct drm_rocket_job *job,
                  struct rocket_job *rjob)
{
        int ret = 0;

        if (job->task_struct_size < sizeof(struct drm_rocket_task))
                return -EINVAL;

        rjob->task_count = job->task_count;

        if (!rjob->task_count)
                return 0;

        rjob->tasks = kvmalloc_objs(*rjob->tasks, job->task_count);
        if (!rjob->tasks) {
                drm_dbg(dev, "Failed to allocate task array\n");
                return -ENOMEM;
        }

        for (int i = 0; i < rjob->task_count; i++) {
                struct drm_rocket_task task = {0};

                if (copy_from_user(&task,
                                   u64_to_user_ptr(job->tasks) + i * job->task_struct_size,
                                   sizeof(task))) {
                        drm_dbg(dev, "Failed to copy incoming tasks\n");
                        ret = -EFAULT;
                        goto fail;
                }

                if (task.regcmd_count == 0) {
                        drm_dbg(dev, "regcmd_count field in drm_rocket_task should be > 0.\n");
                        ret = -EINVAL;
                        goto fail;
                }

                rjob->tasks[i].regcmd = task.regcmd;
                rjob->tasks[i].regcmd_count = task.regcmd_count;
        }

        return 0;

fail:
        kvfree(rjob->tasks);
        return ret;
}

static void rocket_job_hw_submit(struct rocket_core *core, struct rocket_job *job)
{
        struct rocket_task *task;
        unsigned int extra_bit;

        /* Don't queue the job if a reset is in progress */
        if (atomic_read(&core->reset.pending))
                return;

        /* GO ! */

        task = &job->tasks[job->next_task_idx];
        job->next_task_idx++;

        rocket_pc_writel(core, BASE_ADDRESS, 0x1);

         /* From rknpu, in the TRM this bit is marked as reserved */
        extra_bit = 0x10000000 * core->index;
        rocket_cna_writel(core, S_POINTER, CNA_S_POINTER_POINTER_PP_EN(1) |
                                           CNA_S_POINTER_EXECUTER_PP_EN(1) |
                                           CNA_S_POINTER_POINTER_PP_MODE(1) |
                                           extra_bit);

        rocket_core_writel(core, S_POINTER, CORE_S_POINTER_POINTER_PP_EN(1) |
                                            CORE_S_POINTER_EXECUTER_PP_EN(1) |
                                            CORE_S_POINTER_POINTER_PP_MODE(1) |
                                            extra_bit);

        rocket_pc_writel(core, BASE_ADDRESS, task->regcmd);
        rocket_pc_writel(core, REGISTER_AMOUNTS,
                         PC_REGISTER_AMOUNTS_PC_DATA_AMOUNT((task->regcmd_count + 1) / 2 - 1));

        rocket_pc_writel(core, INTERRUPT_MASK, PC_INTERRUPT_MASK_DPU_0 | PC_INTERRUPT_MASK_DPU_1);
        rocket_pc_writel(core, INTERRUPT_CLEAR, PC_INTERRUPT_CLEAR_DPU_0 | PC_INTERRUPT_CLEAR_DPU_1);

        rocket_pc_writel(core, TASK_CON, PC_TASK_CON_RESERVED_0(1) |
                                         PC_TASK_CON_TASK_COUNT_CLEAR(1) |
                                         PC_TASK_CON_TASK_NUMBER(1) |
                                         PC_TASK_CON_TASK_PP_EN(1));

        rocket_pc_writel(core, TASK_DMA_BASE_ADDR, PC_TASK_DMA_BASE_ADDR_DMA_BASE_ADDR(0x0));

        rocket_pc_writel(core, OPERATION_ENABLE, PC_OPERATION_ENABLE_OP_EN(1));

        dev_dbg(core->dev, "Submitted regcmd at 0x%llx to core %d", task->regcmd, core->index);
}

static int rocket_acquire_object_fences(struct drm_gem_object **bos,
                                        int bo_count,
                                        struct drm_sched_job *job,
                                        bool is_write)
{
        int i, ret;

        for (i = 0; i < bo_count; i++) {
                ret = dma_resv_reserve_fences(bos[i]->resv, 1);
                if (ret)
                        return ret;

                ret = drm_sched_job_add_implicit_dependencies(job, bos[i],
                                                              is_write);
                if (ret)
                        return ret;
        }

        return 0;
}

static void rocket_attach_object_fences(struct drm_gem_object **bos,
                                        int bo_count,
                                        struct dma_fence *fence)
{
        int i;

        for (i = 0; i < bo_count; i++)
                dma_resv_add_fence(bos[i]->resv, fence, DMA_RESV_USAGE_WRITE);
}

static int rocket_job_push(struct rocket_job *job)
{
        struct rocket_device *rdev = job->rdev;
        struct drm_gem_object **bos;
        struct ww_acquire_ctx acquire_ctx;
        int ret = 0;

        bos = kvmalloc_array(job->in_bo_count + job->out_bo_count, sizeof(void *),
                             GFP_KERNEL);
        memcpy(bos, job->in_bos, job->in_bo_count * sizeof(void *));
        memcpy(&bos[job->in_bo_count], job->out_bos, job->out_bo_count * sizeof(void *));

        ret = drm_gem_lock_reservations(bos, job->in_bo_count + job->out_bo_count, &acquire_ctx);
        if (ret)
                goto err;

        scoped_guard(mutex, &rdev->sched_lock) {
                drm_sched_job_arm(&job->base);

                job->inference_done_fence = dma_fence_get(&job->base.s_fence->finished);

                ret = rocket_acquire_object_fences(job->in_bos, job->in_bo_count, &job->base, false);
                if (ret)
                        goto err_unlock;

                ret = rocket_acquire_object_fences(job->out_bos, job->out_bo_count, &job->base, true);
                if (ret)
                        goto err_unlock;

                kref_get(&job->refcount); /* put by scheduler job completion */

                drm_sched_entity_push_job(&job->base);
        }

        rocket_attach_object_fences(job->out_bos, job->out_bo_count, job->inference_done_fence);

err_unlock:
        drm_gem_unlock_reservations(bos, job->in_bo_count + job->out_bo_count, &acquire_ctx);
err:
        kvfree(bos);

        return ret;
}

static void rocket_job_cleanup(struct kref *ref)
{
        struct rocket_job *job = container_of(ref, struct rocket_job,
                                                refcount);
        unsigned int i;

        rocket_iommu_domain_put(job->domain);

        dma_fence_put(job->done_fence);
        dma_fence_put(job->inference_done_fence);

        if (job->in_bos) {
                for (i = 0; i < job->in_bo_count; i++)
                        drm_gem_object_put(job->in_bos[i]);

                kvfree(job->in_bos);
        }

        if (job->out_bos) {
                for (i = 0; i < job->out_bo_count; i++)
                        drm_gem_object_put(job->out_bos[i]);

                kvfree(job->out_bos);
        }

        kvfree(job->tasks);

        kfree(job);
}

static void rocket_job_put(struct rocket_job *job)
{
        kref_put(&job->refcount, rocket_job_cleanup);
}

static void rocket_job_free(struct drm_sched_job *sched_job)
{
        struct rocket_job *job = to_rocket_job(sched_job);

        drm_sched_job_cleanup(sched_job);

        rocket_job_put(job);
}

static struct rocket_core *sched_to_core(struct rocket_device *rdev,
                                         struct drm_gpu_scheduler *sched)
{
        unsigned int core;

        for (core = 0; core < rdev->num_cores; core++) {
                if (&rdev->cores[core].sched == sched)
                        return &rdev->cores[core];
        }

        return NULL;
}

static struct dma_fence *rocket_job_run(struct drm_sched_job *sched_job)
{
        struct rocket_job *job = to_rocket_job(sched_job);
        struct rocket_device *rdev = job->rdev;
        struct rocket_core *core = sched_to_core(rdev, sched_job->sched);
        struct dma_fence *fence = NULL;
        int ret;

        if (unlikely(job->base.s_fence->finished.error))
                return NULL;

        /*
         * Nothing to execute: can happen if the job has finished while
         * we were resetting the NPU.
         */
        if (job->next_task_idx == job->task_count)
                return NULL;

        fence = rocket_fence_create(core);
        if (IS_ERR(fence))
                return fence;

        if (job->done_fence)
                dma_fence_put(job->done_fence);
        job->done_fence = dma_fence_get(fence);

        ret = pm_runtime_get_sync(core->dev);
        if (ret < 0)
                return fence;

        ret = iommu_attach_group(job->domain->domain, core->iommu_group);
        if (ret < 0)
                return fence;

        scoped_guard(mutex, &core->job_lock) {
                core->in_flight_job = job;
                rocket_job_hw_submit(core, job);
        }

        return fence;
}

static void rocket_job_handle_irq(struct rocket_core *core)
{
        pm_runtime_mark_last_busy(core->dev);

        rocket_pc_writel(core, OPERATION_ENABLE, 0x0);
        rocket_pc_writel(core, INTERRUPT_CLEAR, 0x1ffff);

        scoped_guard(mutex, &core->job_lock)
                if (core->in_flight_job) {
                        if (core->in_flight_job->next_task_idx < core->in_flight_job->task_count) {
                                rocket_job_hw_submit(core, core->in_flight_job);
                                return;
                        }

                        iommu_detach_group(NULL, iommu_group_get(core->dev));
                        dma_fence_signal(core->in_flight_job->done_fence);
                        pm_runtime_put_autosuspend(core->dev);
                        core->in_flight_job = NULL;
                }
}

static void
rocket_reset(struct rocket_core *core, struct drm_sched_job *bad)
{
        if (!atomic_read(&core->reset.pending))
                return;

        drm_sched_stop(&core->sched, bad);

        /*
         * Remaining interrupts have been handled, but we might still have
         * stuck jobs. Let's make sure the PM counters stay balanced by
         * manually calling pm_runtime_put_noidle().
         */
        scoped_guard(mutex, &core->job_lock) {
                if (core->in_flight_job)
                        pm_runtime_put_noidle(core->dev);

                iommu_detach_group(NULL, core->iommu_group);

                core->in_flight_job = NULL;
        }

        /* Proceed with reset now. */
        rocket_core_reset(core);

        /* NPU has been reset, we can clear the reset pending bit. */
        atomic_set(&core->reset.pending, 0);

        /* Restart the scheduler */
        drm_sched_start(&core->sched, 0);
}

static enum drm_gpu_sched_stat rocket_job_timedout(struct drm_sched_job *sched_job)
{
        struct rocket_job *job = to_rocket_job(sched_job);
        struct rocket_device *rdev = job->rdev;
        struct rocket_core *core = sched_to_core(rdev, sched_job->sched);

        dev_err(core->dev, "NPU job timed out");

        atomic_set(&core->reset.pending, 1);
        rocket_reset(core, sched_job);

        return DRM_GPU_SCHED_STAT_RESET;
}

static void rocket_reset_work(struct work_struct *work)
{
        struct rocket_core *core;

        core = container_of(work, struct rocket_core, reset.work);
        rocket_reset(core, NULL);
}

static const struct drm_sched_backend_ops rocket_sched_ops = {
        .run_job = rocket_job_run,
        .timedout_job = rocket_job_timedout,
        .free_job = rocket_job_free
};

static irqreturn_t rocket_job_irq_handler_thread(int irq, void *data)
{
        struct rocket_core *core = data;

        rocket_job_handle_irq(core);

        return IRQ_HANDLED;
}

static irqreturn_t rocket_job_irq_handler(int irq, void *data)
{
        struct rocket_core *core = data;
        u32 raw_status = rocket_pc_readl(core, INTERRUPT_RAW_STATUS);

        WARN_ON(raw_status & PC_INTERRUPT_RAW_STATUS_DMA_READ_ERROR);
        WARN_ON(raw_status & PC_INTERRUPT_RAW_STATUS_DMA_WRITE_ERROR);

        if (!(raw_status & PC_INTERRUPT_RAW_STATUS_DPU_0 ||
              raw_status & PC_INTERRUPT_RAW_STATUS_DPU_1))
                return IRQ_NONE;

        rocket_pc_writel(core, INTERRUPT_MASK, 0x0);

        return IRQ_WAKE_THREAD;
}

int rocket_job_init(struct rocket_core *core)
{
        struct drm_sched_init_args args = {
                .ops = &rocket_sched_ops,
                .num_rqs = DRM_SCHED_PRIORITY_COUNT,
                .credit_limit = 1,
                .timeout = msecs_to_jiffies(JOB_TIMEOUT_MS),
                .name = dev_name(core->dev),
                .dev = core->dev,
        };
        int ret;

        INIT_WORK(&core->reset.work, rocket_reset_work);
        spin_lock_init(&core->fence_lock);
        mutex_init(&core->job_lock);

        core->irq = platform_get_irq(to_platform_device(core->dev), 0);
        if (core->irq < 0)
                return core->irq;

        ret = devm_request_threaded_irq(core->dev, core->irq,
                                        rocket_job_irq_handler,
                                        rocket_job_irq_handler_thread,
                                        IRQF_SHARED, dev_name(core->dev),
                                        core);
        if (ret) {
                dev_err(core->dev, "failed to request job irq");
                return ret;
        }

        core->reset.wq = alloc_ordered_workqueue("rocket-reset-%d", 0, core->index);
        if (!core->reset.wq)
                return -ENOMEM;

        core->fence_context = dma_fence_context_alloc(1);

        args.timeout_wq = core->reset.wq;
        ret = drm_sched_init(&core->sched, &args);
        if (ret) {
                dev_err(core->dev, "Failed to create scheduler: %d.", ret);
                goto err_sched;
        }

        return 0;

err_sched:
        drm_sched_fini(&core->sched);

        destroy_workqueue(core->reset.wq);
        return ret;
}

void rocket_job_fini(struct rocket_core *core)
{
        drm_sched_fini(&core->sched);

        cancel_work_sync(&core->reset.work);
        destroy_workqueue(core->reset.wq);
}

int rocket_job_open(struct rocket_file_priv *rocket_priv)
{
        struct rocket_device *rdev = rocket_priv->rdev;
        struct drm_gpu_scheduler **scheds = kmalloc_objs(*scheds,
                                                         rdev->num_cores);
        unsigned int core;
        int ret;

        for (core = 0; core < rdev->num_cores; core++)
                scheds[core] = &rdev->cores[core].sched;

        ret = drm_sched_entity_init(&rocket_priv->sched_entity,
                                    DRM_SCHED_PRIORITY_NORMAL,
                                    scheds,
                                    rdev->num_cores, NULL);
        if (WARN_ON(ret))
                return ret;

        return 0;
}

void rocket_job_close(struct rocket_file_priv *rocket_priv)
{
        struct drm_sched_entity *entity = &rocket_priv->sched_entity;

        kfree(entity->sched_list);
        drm_sched_entity_destroy(entity);
}

int rocket_job_is_idle(struct rocket_core *core)
{
        /* If there are any jobs in this HW queue, we're not idle */
        if (atomic_read(&core->sched.credit_count))
                return false;

        return true;
}

static int rocket_ioctl_submit_job(struct drm_device *dev, struct drm_file *file,
                                   struct drm_rocket_job *job)
{
        struct rocket_device *rdev = to_rocket_device(dev);
        struct rocket_file_priv *file_priv = file->driver_priv;
        struct rocket_job *rjob = NULL;
        int ret = 0;

        if (job->task_count == 0)
                return -EINVAL;

        rjob = kzalloc_obj(*rjob);
        if (!rjob)
                return -ENOMEM;

        kref_init(&rjob->refcount);

        rjob->rdev = rdev;

        ret = drm_sched_job_init(&rjob->base,
                                 &file_priv->sched_entity,
                                 1, NULL, file->client_id);
        if (ret)
                goto out_put_job;

        ret = rocket_copy_tasks(dev, file, job, rjob);
        if (ret)
                goto out_cleanup_job;

        ret = drm_gem_objects_lookup(file, u64_to_user_ptr(job->in_bo_handles),
                                     job->in_bo_handle_count, &rjob->in_bos);
        if (ret)
                goto out_cleanup_job;

        rjob->in_bo_count = job->in_bo_handle_count;

        ret = drm_gem_objects_lookup(file, u64_to_user_ptr(job->out_bo_handles),
                                     job->out_bo_handle_count, &rjob->out_bos);
        if (ret)
                goto out_cleanup_job;

        rjob->out_bo_count = job->out_bo_handle_count;

        rjob->domain = rocket_iommu_domain_get(file_priv);

        ret = rocket_job_push(rjob);
        if (ret)
                goto out_cleanup_job;

out_cleanup_job:
        if (ret)
                drm_sched_job_cleanup(&rjob->base);
out_put_job:
        rocket_job_put(rjob);

        return ret;
}

int rocket_ioctl_submit(struct drm_device *dev, void *data, struct drm_file *file)
{
        struct drm_rocket_submit *args = data;
        struct drm_rocket_job *jobs;
        int ret = 0;
        unsigned int i = 0;

        if (args->job_count == 0)
                return 0;

        if (args->job_struct_size < sizeof(struct drm_rocket_job)) {
                drm_dbg(dev, "job_struct_size field in drm_rocket_submit struct is too small.\n");
                return -EINVAL;
        }

        if (args->reserved != 0) {
                drm_dbg(dev, "Reserved field in drm_rocket_submit struct should be 0.\n");
                return -EINVAL;
        }

        jobs = kvmalloc_objs(*jobs, args->job_count);
        if (!jobs) {
                drm_dbg(dev, "Failed to allocate incoming job array\n");
                return -ENOMEM;
        }

        for (i = 0; i < args->job_count; i++) {
                if (copy_from_user(&jobs[i],
                                   u64_to_user_ptr(args->jobs) + i * args->job_struct_size,
                                   sizeof(*jobs))) {
                        ret = -EFAULT;
                        drm_dbg(dev, "Failed to copy incoming job array\n");
                        goto exit;
                }
        }


        for (i = 0; i < args->job_count; i++)
                rocket_ioctl_submit_job(dev, file, &jobs[i]);

exit:
        kvfree(jobs);

        return ret;
}