#include <linux/slab.h>
#include "kfd_priv.h"
#include "kfd_topology.h"
#include "kfd_svm.h"
void print_queue_properties(struct queue_properties *q)
{
if (!q)
return;
pr_debug("Printing queue properties:\n");
pr_debug("Queue Type: %u\n", q->type);
pr_debug("Queue Size: %llu\n", q->queue_size);
pr_debug("Queue percent: %u\n", q->queue_percent);
pr_debug("Queue Address: 0x%llX\n", q->queue_address);
pr_debug("Queue Id: %u\n", q->queue_id);
pr_debug("Queue Process Vmid: %u\n", q->vmid);
pr_debug("Queue Read Pointer: 0x%px\n", q->read_ptr);
pr_debug("Queue Write Pointer: 0x%px\n", q->write_ptr);
pr_debug("Queue Doorbell Pointer: 0x%p\n", q->doorbell_ptr);
pr_debug("Queue Doorbell Offset: %u\n", q->doorbell_off);
}

void print_queue(struct queue *q)
{
if (!q)
return;
pr_debug("Printing queue:\n");
pr_debug("Queue Type: %u\n", q->properties.type);
pr_debug("Queue Size: %llu\n", q->properties.queue_size);
pr_debug("Queue percent: %u\n", q->properties.queue_percent);
pr_debug("Queue Address: 0x%llX\n", q->properties.queue_address);
pr_debug("Queue Id: %u\n", q->properties.queue_id);
pr_debug("Queue Process Vmid: %u\n", q->properties.vmid);
pr_debug("Queue Read Pointer: 0x%px\n", q->properties.read_ptr);
pr_debug("Queue Write Pointer: 0x%px\n", q->properties.write_ptr);
pr_debug("Queue Doorbell Pointer: 0x%p\n", q->properties.doorbell_ptr);
pr_debug("Queue Doorbell Offset: %u\n", q->properties.doorbell_off);
pr_debug("Queue MQD Address: 0x%p\n", q->mqd);
pr_debug("Queue MQD Gart: 0x%llX\n", q->gart_mqd_addr);
pr_debug("Queue Process Address: 0x%p\n", q->process);
pr_debug("Queue Device Address: 0x%p\n", q->device);
}

int init_queue(struct queue **q, const struct queue_properties *properties)
{
	struct queue *tmp_q;

	tmp_q = kzalloc(sizeof(*tmp_q), GFP_KERNEL);
if (!tmp_q)
return -ENOMEM;
memcpy(&tmp_q->properties, properties, sizeof(*properties));
*q = tmp_q;
return 0;
}

void uninit_queue(struct queue *q)
{
kfree(q);
}
#if IS_ENABLED(CONFIG_HSA_AMD_SVM)
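/*
 * kfd_queue_buffer_svm_get - take a queue reference on the SVM ranges backing
 * [addr, addr + size). Fails unless every page is covered by ranges that are
 * mapped to this GPU and flagged always-mapped.
 */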
static int kfd_queue_buffer_svm_get(struct kfd_process_device *pdd, u64 addr, u64 size)
{
struct kfd_process *p = pdd->process;
struct list_head update_list;
struct svm_range *prange;
	int ret = -EINVAL;

	INIT_LIST_HEAD(&update_list);
addr >>= PAGE_SHIFT;
size >>= PAGE_SHIFT;
mutex_lock(&p->svms.lock);
while (size) {
uint32_t gpuid, gpuidx;
int r;
prange = svm_range_from_addr(&p->svms, addr, NULL);
if (!prange)
break;
if (!prange->mapped_to_gpu)
break;
r = kfd_process_gpuid_from_node(p, pdd->dev, &gpuid, &gpuidx);
if (r < 0)
break;
if (!test_bit(gpuidx, prange->bitmap_access) &&
!test_bit(gpuidx, prange->bitmap_aip))
break;
if (!(prange->flags & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED))
break;
list_add(&prange->update_list, &update_list);
if (prange->last - prange->start + 1 >= size) {
size = 0;
break;
}
size -= prange->last - prange->start + 1;
addr += prange->last - prange->start + 1;
}
if (size) {
pr_debug("[0x%llx 0x%llx] not registered\n", addr, addr + size - 1);
goto out_unlock;
}
list_for_each_entry(prange, &update_list, update_list)
atomic_inc(&prange->queue_refcount);
ret = 0;
out_unlock:
mutex_unlock(&p->svms.lock);
return ret;
}
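
/*
 * kfd_queue_buffer_svm_put - drop the queue reference on the SVM ranges (and
 * any child ranges) overlapping [addr, addr + size).
 */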
static void kfd_queue_buffer_svm_put(struct kfd_process_device *pdd, u64 addr, u64 size)
{
struct kfd_process *p = pdd->process;
struct svm_range *prange, *pchild;
struct interval_tree_node *node;
	unsigned long last;

	addr >>= PAGE_SHIFT;
last = addr + (size >> PAGE_SHIFT) - 1;
mutex_lock(&p->svms.lock);
node = interval_tree_iter_first(&p->svms.objects, addr, last);
while (node) {
struct interval_tree_node *next_node;
unsigned long next_start;
prange = container_of(node, struct svm_range, it_node);
next_node = interval_tree_iter_next(node, addr, last);
next_start = min(node->last, last) + 1;
if (atomic_add_unless(&prange->queue_refcount, -1, 0)) {
list_for_each_entry(pchild, &prange->child_list, child_list)
atomic_add_unless(&pchild->queue_refcount, -1, 0);
}
node = next_node;
addr = next_start;
}
mutex_unlock(&p->svms.lock);
}
#else
static int kfd_queue_buffer_svm_get(struct kfd_process_device *pdd, u64 addr, u64 size)
{
return -EINVAL;
}
static void kfd_queue_buffer_svm_put(struct kfd_process_device *pdd, u64 addr, u64 size)
{
}
#endif
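
/*
 * kfd_queue_buffer_get - look up the GPU BO mapped at user address @addr. The
 * address must be the start of a VM mapping and, when @expected_size is
 * non-zero, the mapping must be exactly that size. On success *pbo holds a BO
 * reference and the mapping's queue_refcount is raised so it cannot be
 * unmapped while a queue is using it.
 */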
int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user *addr, struct amdgpu_bo **pbo,
u64 expected_size)
{
struct amdgpu_bo_va_mapping *mapping;
u64 user_addr;
	u64 size;

	user_addr = (u64)addr >> AMDGPU_GPU_PAGE_SHIFT;
size = expected_size >> AMDGPU_GPU_PAGE_SHIFT;
mapping = amdgpu_vm_bo_lookup_mapping(vm, user_addr);
if (!mapping)
goto out_err;
if (user_addr != mapping->start ||
(size != 0 && user_addr + size - 1 != mapping->last)) {
pr_debug("expected size 0x%llx not equal to mapping addr 0x%llx size 0x%llx\n",
expected_size, mapping->start << AMDGPU_GPU_PAGE_SHIFT,
(mapping->last - mapping->start + 1) << AMDGPU_GPU_PAGE_SHIFT);
goto out_err;
}
*pbo = amdgpu_bo_ref(mapping->bo_va->base.bo);
mapping->bo_va->queue_refcount++;
return 0;
out_err:
*pbo = NULL;
return -EINVAL;
}
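
/* Drop the BO reference taken by kfd_queue_buffer_get; clears *bo */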
void kfd_queue_buffer_put(struct amdgpu_bo **bo)
{
amdgpu_bo_unref(bo);
}
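
/*
 * kfd_queue_acquire_buffers - take references on every buffer a user queue
 * relies on: read/write pointers, ring buffer, EOP buffer and CWSR area,
 * validating their sizes against the node properties. The CWSR area may be
 * backed by a BO mapping or, failing that, by SVM ranges.
 */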
int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_properties *properties)
{
struct kfd_topology_device *topo_dev;
u64 expected_queue_size;
struct amdgpu_vm *vm;
u32 total_cwsr_size;
	int err;

	topo_dev = kfd_topology_device_by_id(pdd->dev->id);
if (!topo_dev)
return -EINVAL;
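
	/* On GFX7 and GFX8, AQL queues appear twice their actual size */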
if (properties->type == KFD_QUEUE_TYPE_COMPUTE &&
properties->format == KFD_QUEUE_FORMAT_AQL &&
topo_dev->node_props.gfx_target_version >= 70000 &&
topo_dev->node_props.gfx_target_version < 90000)
expected_queue_size =
PAGE_ALIGN(properties->queue_size / 2);
else
expected_queue_size =
PAGE_ALIGN(properties->queue_size + properties->metadata_queue_size);
vm = drm_priv_to_vm(pdd->drm_priv);
err = amdgpu_bo_reserve(vm->root.bo, false);
if (err)
return err;
err = kfd_queue_buffer_get(vm, properties->write_ptr, &properties->wptr_bo, PAGE_SIZE);
if (err)
goto out_err_unreserve;
err = kfd_queue_buffer_get(vm, properties->read_ptr, &properties->rptr_bo, PAGE_SIZE);
if (err)
goto out_err_unreserve;
err = kfd_queue_buffer_get(vm, (void *)properties->queue_address,
&properties->ring_bo, expected_queue_size);
if (err)
goto out_err_unreserve;
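
	/* Only compute queues carry EOP and CWSR buffers to validate */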
if (properties->type != KFD_QUEUE_TYPE_COMPUTE)
goto out_unreserve;
if (properties->eop_ring_buffer_address) {
if (properties->eop_ring_buffer_size < topo_dev->node_props.eop_buffer_size) {
pr_debug("queue eop bo size 0x%x is less than node eop buf size 0x%x\n",
properties->eop_ring_buffer_size,
topo_dev->node_props.eop_buffer_size);
err = -EINVAL;
goto out_err_unreserve;
}
err = kfd_queue_buffer_get(vm, (void *)properties->eop_ring_buffer_address,
&properties->eop_buf_bo,
ALIGN(properties->eop_ring_buffer_size, PAGE_SIZE));
if (err)
goto out_err_unreserve;
}
if (properties->ctl_stack_size != topo_dev->node_props.ctl_stack_size) {
pr_debug("queue ctl stack size 0x%x not equal to node ctl stack size 0x%x\n",
properties->ctl_stack_size,
topo_dev->node_props.ctl_stack_size);
err = -EINVAL;
goto out_err_unreserve;
}
if (properties->ctx_save_restore_area_size < topo_dev->node_props.cwsr_size) {
pr_debug("queue cwsr size 0x%x not sufficient for node cwsr size 0x%x\n",
properties->ctx_save_restore_area_size,
topo_dev->node_props.cwsr_size);
err = -EINVAL;
goto out_err_unreserve;
}
total_cwsr_size = (properties->ctx_save_restore_area_size +
topo_dev->node_props.debug_memory_size) * NUM_XCC(pdd->dev->xcc_mask);
total_cwsr_size = ALIGN(total_cwsr_size, PAGE_SIZE);
err = kfd_queue_buffer_get(vm, (void *)properties->ctx_save_restore_area_address,
&properties->cwsr_bo, total_cwsr_size);
if (!err)
goto out_unreserve;
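
	/* No BO mapping at the CWSR address; it may be registered as SVM ranges */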
amdgpu_bo_unreserve(vm->root.bo);
err = kfd_queue_buffer_svm_get(pdd, properties->ctx_save_restore_area_address,
total_cwsr_size);
if (err)
goto out_err_release;
return 0;
out_unreserve:
amdgpu_bo_unreserve(vm->root.bo);
return 0;
out_err_unreserve:
amdgpu_bo_unreserve(vm->root.bo);
out_err_release:
kfd_queue_unref_bo_vas(pdd, properties);
kfd_queue_release_buffers(pdd, properties);
return err;
}
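
/*
 * kfd_queue_release_buffers - drop the BO and SVM references taken by
 * kfd_queue_acquire_buffers.
 */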
int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_properties *properties)
{
struct kfd_topology_device *topo_dev;
	u32 total_cwsr_size;

	kfd_queue_buffer_put(&properties->wptr_bo);
kfd_queue_buffer_put(&properties->rptr_bo);
kfd_queue_buffer_put(&properties->ring_bo);
kfd_queue_buffer_put(&properties->eop_buf_bo);
kfd_queue_buffer_put(&properties->cwsr_bo);
topo_dev = kfd_topology_device_by_id(pdd->dev->id);
if (!topo_dev)
return -EINVAL;
total_cwsr_size = (properties->ctx_save_restore_area_size +
topo_dev->node_props.debug_memory_size) * NUM_XCC(pdd->dev->xcc_mask);
total_cwsr_size = ALIGN(total_cwsr_size, PAGE_SIZE);
kfd_queue_buffer_svm_put(pdd, properties->ctx_save_restore_area_address, total_cwsr_size);
return 0;
}
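
/* Undo the queue_refcount that kfd_queue_buffer_get took on the BO VA mapping */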
void kfd_queue_unref_bo_va(struct amdgpu_vm *vm, struct amdgpu_bo **bo)
{
if (*bo) {
struct amdgpu_bo_va *bo_va;
bo_va = amdgpu_vm_bo_find(vm, *bo);
if (bo_va && bo_va->queue_refcount)
bo_va->queue_refcount--;
}
}
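
/*
 * kfd_queue_unref_bo_vas - release the BO VA mapping references for all queue
 * buffers; reserves the VM root BO while walking the mappings.
 */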
int kfd_queue_unref_bo_vas(struct kfd_process_device *pdd,
struct queue_properties *properties)
{
struct amdgpu_vm *vm;
	int err;

	vm = drm_priv_to_vm(pdd->drm_priv);
err = amdgpu_bo_reserve(vm->root.bo, false);
if (err)
return err;
kfd_queue_unref_bo_va(vm, &properties->wptr_bo);
kfd_queue_unref_bo_va(vm, &properties->rptr_bo);
kfd_queue_unref_bo_va(vm, &properties->ring_bo);
kfd_queue_unref_bo_va(vm, &properties->eop_buf_bo);
kfd_queue_unref_bo_va(vm, &properties->cwsr_bo);
amdgpu_bo_unreserve(vm->root.bo);
return 0;
}
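
/* Debugger save area: 32 bytes per wave, allocation aligned to 64 bytes */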
#define DEBUGGER_BYTES_ALIGN 64
#define DEBUGGER_BYTES_PER_WAVE 32
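
/* Per-CU SGPR/VGPR/HWREG/LDS save sizes vary with the GFX IP version */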
static u32 kfd_get_sgpr_size_per_cu(u32 gfxv)
{
u32 sgpr_size = 0x4000;
if (gfxv == 120500 ||
gfxv == 120501)
sgpr_size = 0x8000;
return sgpr_size;
}

static u32 kfd_get_vgpr_size_per_cu(u32 gfxv)
{
u32 vgpr_size = 0x40000;
if (gfxv == 90402 ||
gfxv == 90010 ||
gfxv == 90008 ||
gfxv == 90500)
vgpr_size = 0x80000;
else if (gfxv == 110000 ||
gfxv == 110001 ||
gfxv == 110501 ||
gfxv == 120000 ||
gfxv == 120001)
vgpr_size = 0x60000;
else if (gfxv == 120500 ||
gfxv == 120501)
vgpr_size = 0x80000;
return vgpr_size;
}

static u32 kfd_get_hwreg_size_per_cu(u32 gfxv)
{
u32 hwreg_size = 0x1000;
if (gfxv == 120500 || gfxv == 120501)
hwreg_size = 0x8000;
return hwreg_size;
}

static u32 kfd_get_lds_size_per_cu(u32 gfxv, struct kfd_node_properties *props)
{
u32 lds_size = 0x10000;
if (gfxv == 90500 || gfxv == 120500 || gfxv == 120501)
lds_size = props->lds_size_in_kb << 10;
return lds_size;
}
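
/* Worst-case number of waves in flight on one XCC, used to size save areas */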
static u32 get_num_waves(struct kfd_node_properties *props, u32 gfxv, u32 cu_num)
{
u32 wave_num = 0;
if (gfxv < 100100)
wave_num = min(cu_num * 40,
props->array_count / props->simd_arrays_per_engine * 512);
else if (gfxv < 120500)
wave_num = cu_num * 32;
else if (gfxv <= 120501)
wave_num = cu_num * 64;
WARN_ON(wave_num == 0);
return wave_num;
}
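
/* Per-CU work-group context save size: VGPRs + SGPRs + LDS + HW registers */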
#define WG_CONTEXT_DATA_SIZE_PER_CU(gfxv, props) \
(kfd_get_vgpr_size_per_cu(gfxv) + kfd_get_sgpr_size_per_cu(gfxv) +\
kfd_get_lds_size_per_cu(gfxv, props) + kfd_get_hwreg_size_per_cu(gfxv))
#define CNTL_STACK_BYTES_PER_WAVE(gfxv) \
((gfxv) >= 100100 ? 12 : 8)
#define SIZEOF_HSA_USER_CONTEXT_SAVE_AREA_HEADER 40
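
/*
 * kfd_queue_ctx_save_restore_size - compute the control stack, CWSR area and
 * debugger save area sizes for a node from its worst-case wave count and
 * per-CU context data size.
 */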
void kfd_queue_ctx_save_restore_size(struct kfd_topology_device *dev)
{
struct kfd_node_properties *props = &dev->node_props;
u32 gfxv = props->gfx_target_version;
u32 ctl_stack_size;
u32 wg_data_size;
u32 wave_num;
	u32 cu_num;

	if (gfxv < 80001)	/* CWSR is supported on gfx8.0.1 (Carrizo) and later */
		return;
cu_num = props->simd_count / props->simd_per_cu / NUM_XCC(dev->gpu->xcc_mask);
wave_num = get_num_waves(props, gfxv, cu_num);
wg_data_size = ALIGN(cu_num * WG_CONTEXT_DATA_SIZE_PER_CU(gfxv, props),
AMDGPU_GPU_PAGE_SIZE);
ctl_stack_size = wave_num * CNTL_STACK_BYTES_PER_WAVE(gfxv) + 8;
ctl_stack_size = ALIGN(SIZEOF_HSA_USER_CONTEXT_SAVE_AREA_HEADER + ctl_stack_size,
AMDGPU_GPU_PAGE_SIZE);
	if ((gfxv / 10000 * 10000) == 100000) {
		/* gfx10 hardware limits the control stack size to 0x7000 bytes */
		ctl_stack_size = min(ctl_stack_size, 0x7000);
	}
props->ctl_stack_size = ctl_stack_size;
props->debug_memory_size = ALIGN(wave_num * DEBUGGER_BYTES_PER_WAVE, DEBUGGER_BYTES_ALIGN);
props->cwsr_size = ALIGN(ctl_stack_size + wg_data_size, PAGE_SIZE);
	if (gfxv == 80002)	/* Tonga */
		props->eop_buffer_size = 0x8000;
	else if (gfxv == 90402)	/* Aqua Vanjaram */
		props->eop_buffer_size = 4096;
	else if (gfxv >= 80000)
		props->eop_buffer_size = 4096;
}