#include <sys/cdefs.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/mutex.h>
#include <linux/netdevice.h>
#include <rdma/ib_addr.h>
#include <rdma/ib_cache.h>
#include "core_priv.h"
MODULE_AUTHOR("Roland Dreier");
MODULE_DESCRIPTION("core kernel InfiniBand API");
MODULE_LICENSE("Dual BSD/GPL");
struct ib_client_data {
struct list_head list;
struct ib_client *client;
void * data;
bool going_down;
};
struct workqueue_struct *ib_comp_wq;
struct workqueue_struct *ib_wq;
EXPORT_SYMBOL_GPL(ib_wq);
static LIST_HEAD(device_list);
static LIST_HEAD(client_list);
static DEFINE_MUTEX(device_mutex);
static DECLARE_RWSEM(lists_rwsem);
static int ib_device_check_mandatory(struct ib_device *device)
{
#define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device, x), #x }
static const struct {
size_t offset;
char *name;
} mandatory_table[] = {
IB_MANDATORY_FUNC(query_device),
IB_MANDATORY_FUNC(query_port),
IB_MANDATORY_FUNC(query_pkey),
IB_MANDATORY_FUNC(query_gid),
IB_MANDATORY_FUNC(alloc_pd),
IB_MANDATORY_FUNC(dealloc_pd),
IB_MANDATORY_FUNC(create_ah),
IB_MANDATORY_FUNC(destroy_ah),
IB_MANDATORY_FUNC(create_qp),
IB_MANDATORY_FUNC(modify_qp),
IB_MANDATORY_FUNC(destroy_qp),
IB_MANDATORY_FUNC(post_send),
IB_MANDATORY_FUNC(post_recv),
IB_MANDATORY_FUNC(create_cq),
IB_MANDATORY_FUNC(destroy_cq),
IB_MANDATORY_FUNC(poll_cq),
IB_MANDATORY_FUNC(req_notify_cq),
IB_MANDATORY_FUNC(get_dma_mr),
IB_MANDATORY_FUNC(dereg_mr),
IB_MANDATORY_FUNC(get_port_immutable)
};
int i;
for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) {
if (!*(void **) ((char *) device + mandatory_table[i].offset)) {
pr_warn("Device %s is missing mandatory function %s\n",
device->name, mandatory_table[i].name);
return -EINVAL;
}
}
return 0;
}
static struct ib_device *__ib_device_get_by_name(const char *name)
{
struct ib_device *device;
list_for_each_entry(device, &device_list, core_list)
if (!strncmp(name, device->name, IB_DEVICE_NAME_MAX))
return device;
return NULL;
}
static int alloc_name(char *name)
{
unsigned long *inuse;
char buf[IB_DEVICE_NAME_MAX];
struct ib_device *device;
int i;
inuse = (unsigned long *) get_zeroed_page(GFP_KERNEL);
if (!inuse)
return -ENOMEM;
list_for_each_entry(device, &device_list, core_list) {
if (!sscanf(device->name, name, &i))
continue;
if (i < 0 || i >= PAGE_SIZE * 8)
continue;
snprintf(buf, sizeof buf, name, i);
if (!strncmp(buf, device->name, IB_DEVICE_NAME_MAX))
set_bit(i, inuse);
}
i = find_first_zero_bit(inuse, PAGE_SIZE * 8);
free_page((unsigned long) inuse);
snprintf(buf, sizeof buf, name, i);
if (__ib_device_get_by_name(buf))
return -ENFILE;
strlcpy(name, buf, IB_DEVICE_NAME_MAX);
return 0;
}
static void ib_device_release(struct device *device)
{
struct ib_device *dev = container_of(device, struct ib_device, dev);
WARN_ON(dev->reg_state == IB_DEV_REGISTERED);
if (dev->reg_state == IB_DEV_UNREGISTERED) {
ib_cache_release_one(dev);
kfree(dev->port_immutable);
}
kfree(dev);
}
static struct class ib_class = {
.name = "infiniband",
.dev_release = ib_device_release,
};
struct ib_device *ib_alloc_device(size_t size)
{
struct ib_device *device;
if (WARN_ON(size < sizeof(struct ib_device)))
return NULL;
device = kzalloc(size, GFP_KERNEL);
if (!device)
return NULL;
device->dev.parent = &linux_root_device;
device->dev.class = &ib_class;
device_initialize(&device->dev);
dev_set_drvdata(&device->dev, device);
INIT_LIST_HEAD(&device->event_handler_list);
spin_lock_init(&device->event_handler_lock);
spin_lock_init(&device->client_data_lock);
INIT_LIST_HEAD(&device->client_data_list);
INIT_LIST_HEAD(&device->port_list);
return device;
}
EXPORT_SYMBOL(ib_alloc_device);
void ib_dealloc_device(struct ib_device *device)
{
WARN_ON(device->reg_state != IB_DEV_UNREGISTERED &&
device->reg_state != IB_DEV_UNINITIALIZED);
kobject_put(&device->dev.kobj);
}
EXPORT_SYMBOL(ib_dealloc_device);
static int add_client_context(struct ib_device *device, struct ib_client *client)
{
struct ib_client_data *context;
unsigned long flags;
context = kmalloc(sizeof *context, GFP_KERNEL);
if (!context) {
pr_warn("Couldn't allocate client context for %s/%s\n",
device->name, client->name);
return -ENOMEM;
}
context->client = client;
context->data = NULL;
context->going_down = false;
down_write(&lists_rwsem);
spin_lock_irqsave(&device->client_data_lock, flags);
list_add(&context->list, &device->client_data_list);
spin_unlock_irqrestore(&device->client_data_lock, flags);
up_write(&lists_rwsem);
return 0;
}
static int verify_immutable(const struct ib_device *dev, u8 port)
{
return WARN_ON(!rdma_cap_ib_mad(dev, port) &&
rdma_max_mad_size(dev, port) != 0);
}
static int read_port_immutable(struct ib_device *device)
{
int ret;
u8 start_port = rdma_start_port(device);
u8 end_port = rdma_end_port(device);
u8 port;
device->port_immutable = kzalloc(sizeof(*device->port_immutable)
* (end_port + 1),
GFP_KERNEL);
if (!device->port_immutable)
return -ENOMEM;
for (port = start_port; port <= end_port; ++port) {
ret = device->get_port_immutable(device, port,
&device->port_immutable[port]);
if (ret)
return ret;
if (verify_immutable(device, port))
return -EINVAL;
}
return 0;
}
void ib_get_device_fw_str(struct ib_device *dev, char *str, size_t str_len)
{
if (dev->get_dev_fw_str)
dev->get_dev_fw_str(dev, str, str_len);
else
str[0] = '\0';
}
EXPORT_SYMBOL(ib_get_device_fw_str);
int ib_register_device(struct ib_device *device,
int (*port_callback)(struct ib_device *,
u8, struct kobject *))
{
int ret;
struct ib_client *client;
struct ib_udata uhw = {.outlen = 0, .inlen = 0};
mutex_lock(&device_mutex);
if (strchr(device->name, '%')) {
ret = alloc_name(device->name);
if (ret)
goto out;
}
if (ib_device_check_mandatory(device)) {
ret = -EINVAL;
goto out;
}
ret = read_port_immutable(device);
if (ret) {
pr_warn("Couldn't create per port immutable data %s\n",
device->name);
goto out;
}
ret = ib_cache_setup_one(device);
if (ret) {
pr_warn("Couldn't set up InfiniBand P_Key/GID cache\n");
goto port_cleanup;
}
memset(&device->attrs, 0, sizeof(device->attrs));
ret = device->query_device(device, &device->attrs, &uhw);
if (ret) {
pr_warn("Couldn't query the device attributes\n");
goto cache_cleanup;
}
ret = ib_device_register_sysfs(device, port_callback);
if (ret) {
pr_warn("Couldn't register device %s with driver model\n",
device->name);
goto cache_cleanup;
}
device->reg_state = IB_DEV_REGISTERED;
list_for_each_entry(client, &client_list, list)
if (client->add && !add_client_context(device, client))
client->add(device);
down_write(&lists_rwsem);
list_add_tail(&device->core_list, &device_list);
up_write(&lists_rwsem);
mutex_unlock(&device_mutex);
return 0;
cache_cleanup:
ib_cache_cleanup_one(device);
ib_cache_release_one(device);
port_cleanup:
kfree(device->port_immutable);
out:
mutex_unlock(&device_mutex);
return ret;
}
EXPORT_SYMBOL(ib_register_device);
void ib_unregister_device(struct ib_device *device)
{
struct ib_client_data *context, *tmp;
unsigned long flags;
mutex_lock(&device_mutex);
down_write(&lists_rwsem);
list_del(&device->core_list);
spin_lock_irqsave(&device->client_data_lock, flags);
list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
context->going_down = true;
spin_unlock_irqrestore(&device->client_data_lock, flags);
downgrade_write(&lists_rwsem);
list_for_each_entry_safe(context, tmp, &device->client_data_list,
list) {
if (context->client->remove)
context->client->remove(device, context->data);
}
up_read(&lists_rwsem);
mutex_unlock(&device_mutex);
ib_device_unregister_sysfs(device);
ib_cache_cleanup_one(device);
down_write(&lists_rwsem);
spin_lock_irqsave(&device->client_data_lock, flags);
list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
kfree(context);
spin_unlock_irqrestore(&device->client_data_lock, flags);
up_write(&lists_rwsem);
device->reg_state = IB_DEV_UNREGISTERED;
}
EXPORT_SYMBOL(ib_unregister_device);
int ib_register_client(struct ib_client *client)
{
struct ib_device *device;
mutex_lock(&device_mutex);
list_for_each_entry(device, &device_list, core_list)
if (client->add && !add_client_context(device, client))
client->add(device);
down_write(&lists_rwsem);
list_add_tail(&client->list, &client_list);
up_write(&lists_rwsem);
mutex_unlock(&device_mutex);
return 0;
}
EXPORT_SYMBOL(ib_register_client);
void ib_unregister_client(struct ib_client *client)
{
struct ib_client_data *context, *tmp;
struct ib_device *device;
unsigned long flags;
mutex_lock(&device_mutex);
down_write(&lists_rwsem);
list_del(&client->list);
up_write(&lists_rwsem);
list_for_each_entry(device, &device_list, core_list) {
struct ib_client_data *found_context = NULL;
down_write(&lists_rwsem);
spin_lock_irqsave(&device->client_data_lock, flags);
list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
if (context->client == client) {
context->going_down = true;
found_context = context;
break;
}
spin_unlock_irqrestore(&device->client_data_lock, flags);
up_write(&lists_rwsem);
if (client->remove)
client->remove(device, found_context ?
found_context->data : NULL);
if (!found_context) {
pr_warn("No client context found for %s/%s\n",
device->name, client->name);
continue;
}
down_write(&lists_rwsem);
spin_lock_irqsave(&device->client_data_lock, flags);
list_del(&found_context->list);
kfree(found_context);
spin_unlock_irqrestore(&device->client_data_lock, flags);
up_write(&lists_rwsem);
}
mutex_unlock(&device_mutex);
}
EXPORT_SYMBOL(ib_unregister_client);
void *ib_get_client_data(struct ib_device *device, struct ib_client *client)
{
struct ib_client_data *context;
void *ret = NULL;
unsigned long flags;
spin_lock_irqsave(&device->client_data_lock, flags);
list_for_each_entry(context, &device->client_data_list, list)
if (context->client == client) {
ret = context->data;
break;
}
spin_unlock_irqrestore(&device->client_data_lock, flags);
return ret;
}
EXPORT_SYMBOL(ib_get_client_data);
void ib_set_client_data(struct ib_device *device, struct ib_client *client,
void *data)
{
struct ib_client_data *context;
unsigned long flags;
spin_lock_irqsave(&device->client_data_lock, flags);
list_for_each_entry(context, &device->client_data_list, list)
if (context->client == client) {
context->data = data;
goto out;
}
pr_warn("No client context found for %s/%s\n",
device->name, client->name);
out:
spin_unlock_irqrestore(&device->client_data_lock, flags);
}
EXPORT_SYMBOL(ib_set_client_data);
int ib_register_event_handler (struct ib_event_handler *event_handler)
{
unsigned long flags;
spin_lock_irqsave(&event_handler->device->event_handler_lock, flags);
list_add_tail(&event_handler->list,
&event_handler->device->event_handler_list);
spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags);
return 0;
}
EXPORT_SYMBOL(ib_register_event_handler);
int ib_unregister_event_handler(struct ib_event_handler *event_handler)
{
unsigned long flags;
spin_lock_irqsave(&event_handler->device->event_handler_lock, flags);
list_del(&event_handler->list);
spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags);
return 0;
}
EXPORT_SYMBOL(ib_unregister_event_handler);
void ib_dispatch_event(struct ib_event *event)
{
unsigned long flags;
struct ib_event_handler *handler;
spin_lock_irqsave(&event->device->event_handler_lock, flags);
list_for_each_entry(handler, &event->device->event_handler_list, list)
handler->handler(handler, event);
spin_unlock_irqrestore(&event->device->event_handler_lock, flags);
}
EXPORT_SYMBOL(ib_dispatch_event);
int ib_query_port(struct ib_device *device,
u8 port_num,
struct ib_port_attr *port_attr)
{
union ib_gid gid;
int err;
if (!rdma_is_port_valid(device, port_num))
return -EINVAL;
memset(port_attr, 0, sizeof(*port_attr));
err = device->query_port(device, port_num, port_attr);
if (err || port_attr->subnet_prefix)
return err;
if (rdma_port_get_link_layer(device, port_num) != IB_LINK_LAYER_INFINIBAND)
return 0;
err = ib_query_gid(device, port_num, 0, &gid, NULL);
if (err)
return err;
port_attr->subnet_prefix = be64_to_cpu(gid.global.subnet_prefix);
return 0;
}
EXPORT_SYMBOL(ib_query_port);
int ib_query_gid(struct ib_device *device,
u8 port_num, int index, union ib_gid *gid,
struct ib_gid_attr *attr)
{
if (rdma_cap_roce_gid_table(device, port_num))
return ib_get_cached_gid(device, port_num, index, gid, attr);
if (attr)
return -EINVAL;
return device->query_gid(device, port_num, index, gid);
}
EXPORT_SYMBOL(ib_query_gid);
void ib_enum_roce_netdev(struct ib_device *ib_dev,
roce_netdev_filter filter,
void *filter_cookie,
roce_netdev_callback cb,
void *cookie)
{
u8 port;
for (port = rdma_start_port(ib_dev); port <= rdma_end_port(ib_dev);
port++)
if (rdma_protocol_roce(ib_dev, port)) {
if_t idev = NULL;
if (ib_dev->get_netdev)
idev = ib_dev->get_netdev(ib_dev, port);
if (idev && (if_getflags(idev) & IFF_DYING)) {
dev_put(idev);
idev = NULL;
}
if (filter(ib_dev, port, idev, filter_cookie))
cb(ib_dev, port, idev, cookie);
if (idev)
dev_put(idev);
}
}
void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
void *filter_cookie,
roce_netdev_callback cb,
void *cookie)
{
struct ib_device *dev;
down_read(&lists_rwsem);
list_for_each_entry(dev, &device_list, core_list)
ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie);
up_read(&lists_rwsem);
}
void ib_cache_gid_del_all_by_netdev(if_t ndev)
{
struct ib_device *ib_dev;
u8 port;
down_read(&lists_rwsem);
list_for_each_entry(ib_dev, &device_list, core_list) {
for (port = rdma_start_port(ib_dev);
port <= rdma_end_port(ib_dev);
port++) {
if (rdma_protocol_roce(ib_dev, port) == 0)
continue;
(void) ib_cache_gid_del_all_netdev_gids(ib_dev, port, ndev);
}
}
up_read(&lists_rwsem);
}
int ib_query_pkey(struct ib_device *device,
u8 port_num, u16 index, u16 *pkey)
{
if (!rdma_is_port_valid(device, port_num))
return -EINVAL;
return device->query_pkey(device, port_num, index, pkey);
}
EXPORT_SYMBOL(ib_query_pkey);
int ib_modify_device(struct ib_device *device,
int device_modify_mask,
struct ib_device_modify *device_modify)
{
if (!device->modify_device)
return -ENOSYS;
return device->modify_device(device, device_modify_mask,
device_modify);
}
EXPORT_SYMBOL(ib_modify_device);
int ib_modify_port(struct ib_device *device,
u8 port_num, int port_modify_mask,
struct ib_port_modify *port_modify)
{
if (!device->modify_port)
return -ENOSYS;
if (!rdma_is_port_valid(device, port_num))
return -EINVAL;
return device->modify_port(device, port_num, port_modify_mask,
port_modify);
}
EXPORT_SYMBOL(ib_modify_port);
int ib_find_gid(struct ib_device *device, union ib_gid *gid,
enum ib_gid_type gid_type, if_t ndev,
u8 *port_num, u16 *index)
{
union ib_gid tmp_gid;
int ret, port, i;
for (port = rdma_start_port(device); port <= rdma_end_port(device); ++port) {
if (rdma_cap_roce_gid_table(device, port)) {
if (!ib_find_cached_gid_by_port(device, gid, gid_type, port,
ndev, index)) {
*port_num = port;
return 0;
}
}
if (gid_type != IB_GID_TYPE_IB)
continue;
for (i = 0; i < device->port_immutable[port].gid_tbl_len; ++i) {
ret = ib_query_gid(device, port, i, &tmp_gid, NULL);
if (ret)
return ret;
if (!memcmp(&tmp_gid, gid, sizeof *gid)) {
*port_num = port;
if (index)
*index = i;
return 0;
}
}
}
return -ENOENT;
}
EXPORT_SYMBOL(ib_find_gid);
int ib_find_pkey(struct ib_device *device,
u8 port_num, u16 pkey, u16 *index)
{
int ret, i;
u16 tmp_pkey;
int partial_ix = -1;
for (i = 0; i < device->port_immutable[port_num].pkey_tbl_len; ++i) {
ret = ib_query_pkey(device, port_num, i, &tmp_pkey);
if (ret)
return ret;
if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) {
if (tmp_pkey & 0x8000) {
*index = i;
return 0;
}
if (partial_ix < 0)
partial_ix = i;
}
}
if (partial_ix >= 0) {
*index = partial_ix;
return 0;
}
return -ENOENT;
}
EXPORT_SYMBOL(ib_find_pkey);
if_t ib_get_net_dev_by_params(struct ib_device *dev,
u8 port,
u16 pkey,
const union ib_gid *gid,
const struct sockaddr *addr)
{
if_t net_dev = NULL;
struct ib_client_data *context;
if (!rdma_protocol_ib(dev, port))
return NULL;
down_read(&lists_rwsem);
list_for_each_entry(context, &dev->client_data_list, list) {
struct ib_client *client = context->client;
if (context->going_down)
continue;
if (client->get_net_dev_by_params) {
net_dev = client->get_net_dev_by_params(dev, port, pkey,
gid, addr,
context->data);
if (net_dev)
break;
}
}
up_read(&lists_rwsem);
return net_dev;
}
EXPORT_SYMBOL(ib_get_net_dev_by_params);
static int __init ib_core_init(void)
{
int ret;
ib_wq = alloc_workqueue("infiniband", 0, 0);
if (!ib_wq)
return -ENOMEM;
ib_comp_wq = alloc_workqueue("ib-comp-wq",
WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM,
mp_ncpus * 4 );
if (!ib_comp_wq) {
ret = -ENOMEM;
goto err;
}
ret = class_register(&ib_class);
if (ret) {
pr_warn("Couldn't create InfiniBand device class\n");
goto err_comp;
}
ret = addr_init();
if (ret) {
pr_warn("Could't init IB address resolution\n");
goto err_sysfs;
}
ret = ib_mad_init();
if (ret) {
pr_warn("Couldn't init IB MAD\n");
goto err_addr;
}
ret = ib_sa_init();
if (ret) {
pr_warn("Couldn't init SA\n");
goto err_mad;
}
ib_cache_setup();
return 0;
err_mad:
ib_mad_cleanup();
err_addr:
addr_cleanup();
err_sysfs:
class_unregister(&ib_class);
err_comp:
destroy_workqueue(ib_comp_wq);
err:
destroy_workqueue(ib_wq);
return ret;
}
static void __exit ib_core_cleanup(void)
{
ib_cache_cleanup();
ib_sa_cleanup();
ib_mad_cleanup();
addr_cleanup();
class_unregister(&ib_class);
destroy_workqueue(ib_comp_wq);
destroy_workqueue(ib_wq);
}
module_init_order(ib_core_init, SI_ORDER_FIRST);
module_exit_order(ib_core_cleanup, SI_ORDER_FIRST);
MODULE_VERSION(ibcore, 1);
MODULE_DEPEND(ibcore, linuxkpi, 1, 1, 1);