/* ldc.c: Logical Domain Channel link-layer protocol driver. * * Copyright (C) 2007 David S. Miller */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define DRV_MODULE_NAME "ldc" #define PFX DRV_MODULE_NAME ": " #define DRV_MODULE_VERSION "1.0" #define DRV_MODULE_RELDATE "June 25, 2007" static char version[] __devinitdata = DRV_MODULE_NAME ".c:v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n"; #define LDC_PACKET_SIZE 64 /* Packet header layout for unreliable and reliable mode frames. * When in RAW mode, packets are simply straight 64-byte payloads * with no headers. */ struct ldc_packet { u8 type; #define LDC_CTRL 0x01 #define LDC_DATA 0x02 #define LDC_ERR 0x10 u8 stype; #define LDC_INFO 0x01 #define LDC_ACK 0x02 #define LDC_NACK 0x04 u8 ctrl; #define LDC_VERS 0x01 /* Link Version */ #define LDC_RTS 0x02 /* Request To Send */ #define LDC_RTR 0x03 /* Ready To Receive */ #define LDC_RDX 0x04 /* Ready for Data eXchange */ #define LDC_CTRL_MSK 0x0f u8 env; #define LDC_LEN 0x3f #define LDC_FRAG_MASK 0xc0 #define LDC_START 0x40 #define LDC_STOP 0x80 u32 seqid; union { u8 u_data[LDC_PACKET_SIZE - 8]; struct { u32 pad; u32 ackid; u8 r_data[LDC_PACKET_SIZE - 8 - 8]; } r; } u; }; struct ldc_version { u16 major; u16 minor; }; /* Ordered from largest major to lowest. */ static struct ldc_version ver_arr[] = { { .major = 1, .minor = 0 }, }; #define LDC_DEFAULT_MTU (4 * LDC_PACKET_SIZE) #define LDC_DEFAULT_NUM_ENTRIES (PAGE_SIZE / LDC_PACKET_SIZE) struct ldc_channel; struct ldc_mode_ops { int (*write)(struct ldc_channel *, const void *, unsigned int); int (*read)(struct ldc_channel *, void *, unsigned int); }; static const struct ldc_mode_ops raw_ops; static const struct ldc_mode_ops nonraw_ops; static const struct ldc_mode_ops stream_ops; int ldom_domaining_enabled; struct ldc_iommu { /* Protects arena alloc/free. 
*/ spinlock_t lock; struct iommu_arena arena; struct ldc_mtable_entry *page_table; }; struct ldc_channel { /* Protects all operations that depend upon channel state. */ spinlock_t lock; unsigned long id; u8 *mssbuf; u32 mssbuf_len; u32 mssbuf_off; struct ldc_packet *tx_base; unsigned long tx_head; unsigned long tx_tail; unsigned long tx_num_entries; unsigned long tx_ra; unsigned long tx_acked; struct ldc_packet *rx_base; unsigned long rx_head; unsigned long rx_tail; unsigned long rx_num_entries; unsigned long rx_ra; u32 rcv_nxt; u32 snd_nxt; unsigned long chan_state; struct ldc_channel_config cfg; void *event_arg; const struct ldc_mode_ops *mops; struct ldc_iommu iommu; struct ldc_version ver; u8 hs_state; #define LDC_HS_CLOSED 0x00 #define LDC_HS_OPEN 0x01 #define LDC_HS_GOTVERS 0x02 #define LDC_HS_SENTRTR 0x03 #define LDC_HS_GOTRTR 0x04 #define LDC_HS_COMPLETE 0x10 u8 flags; #define LDC_FLAG_ALLOCED_QUEUES 0x01 #define LDC_FLAG_REGISTERED_QUEUES 0x02 #define LDC_FLAG_REGISTERED_IRQS 0x04 #define LDC_FLAG_RESET 0x10 u8 mss; u8 state; #define LDC_IRQ_NAME_MAX 32 char rx_irq_name[LDC_IRQ_NAME_MAX]; char tx_irq_name[LDC_IRQ_NAME_MAX]; struct hlist_head mh_list; struct hlist_node list; }; #define ldcdbg(TYPE, f, a...) 
\ do { if (lp->cfg.debug & LDC_DEBUG_##TYPE) \ printk(KERN_INFO PFX "ID[%lu] " f, lp->id, ## a); \ } while (0) static const char *state_to_str(u8 state) { switch (state) { case LDC_STATE_INVALID: return "INVALID"; case LDC_STATE_INIT: return "INIT"; case LDC_STATE_BOUND: return "BOUND"; case LDC_STATE_READY: return "READY"; case LDC_STATE_CONNECTED: return "CONNECTED"; default: return ""; } } static void ldc_set_state(struct ldc_channel *lp, u8 state) { ldcdbg(STATE, "STATE (%s) --> (%s)\n", state_to_str(lp->state), state_to_str(state)); lp->state = state; } static unsigned long __advance(unsigned long off, unsigned long num_entries) { off += LDC_PACKET_SIZE; if (off == (num_entries * LDC_PACKET_SIZE)) off = 0; return off; } static unsigned long rx_advance(struct ldc_channel *lp, unsigned long off) { return __advance(off, lp->rx_num_entries); } static unsigned long tx_advance(struct ldc_channel *lp, unsigned long off) { return __advance(off, lp->tx_num_entries); } static struct ldc_packet *handshake_get_tx_packet(struct ldc_channel *lp, unsigned long *new_tail) { struct ldc_packet *p; unsigned long t; t = tx_advance(lp, lp->tx_tail); if (t == lp->tx_head) return NULL; *new_tail = t; p = lp->tx_base; return p + (lp->tx_tail / LDC_PACKET_SIZE); } /* When we are in reliable or stream mode, have to track the next packet * we haven't gotten an ACK for in the TX queue using tx_acked. We have * to be careful not to stomp over the queue past that point. During * the handshake, we don't have TX data packets pending in the queue * and that's why handshake_get_tx_packet() need not be mindful of * lp->tx_acked. 
*/ static unsigned long head_for_data(struct ldc_channel *lp) { if (lp->cfg.mode == LDC_MODE_STREAM) return lp->tx_acked; return lp->tx_head; } static int tx_has_space_for(struct ldc_channel *lp, unsigned int size) { unsigned long limit, tail, new_tail, diff; unsigned int mss; limit = head_for_data(lp); tail = lp->tx_tail; new_tail = tx_advance(lp, tail); if (new_tail == limit) return 0; if (limit > new_tail) diff = limit - new_tail; else diff = (limit + ((lp->tx_num_entries * LDC_PACKET_SIZE) - new_tail)); diff /= LDC_PACKET_SIZE; mss = lp->mss; if (diff * mss < size) return 0; return 1; } static struct ldc_packet *data_get_tx_packet(struct ldc_channel *lp, unsigned long *new_tail) { struct ldc_packet *p; unsigned long h, t; h = head_for_data(lp); t = tx_advance(lp, lp->tx_tail); if (t == h) return NULL; *new_tail = t; p = lp->tx_base; return p + (lp->tx_tail / LDC_PACKET_SIZE); } static int set_tx_tail(struct ldc_channel *lp, unsigned long tail) { unsigned long orig_tail = lp->tx_tail; int limit = 1000; lp->tx_tail = tail; while (limit-- > 0) { unsigned long err; err = sun4v_ldc_tx_set_qtail(lp->id, tail); if (!err) return 0; if (err != HV_EWOULDBLOCK) { lp->tx_tail = orig_tail; return -EINVAL; } udelay(1); } lp->tx_tail = orig_tail; return -EBUSY; } /* This just updates the head value in the hypervisor using * a polling loop with a timeout. The caller takes care of * upating software state representing the head change, if any. 
*/
static int __set_rx_head(struct ldc_channel *lp, unsigned long head)
{
	int limit = 1000;

	/* Up to 1000 attempts, 1us apart, while the hypervisor is busy. */
	while (limit-- > 0) {
		unsigned long err;

		err = sun4v_ldc_rx_set_qhead(lp->id, head);
		if (!err)
			return 0;

		if (err != HV_EWOULDBLOCK)
			return -EINVAL;

		udelay(1);
	}

	return -EBUSY;
}

/* Commit a packet that was filled in at the current TX tail by moving
 * the tail to new_tail.  'p' must be the slot at lp->tx_tail.
 * Returns whatever set_tx_tail() returns.
 */
static int send_tx_packet(struct ldc_channel *lp,
			  struct ldc_packet *p,
			  unsigned long new_tail)
{
	BUG_ON(p != (lp->tx_base + (lp->tx_tail / LDC_PACKET_SIZE)));

	return set_tx_tail(lp, new_tail);
}

/* Grab the next TX slot and initialize it as a zeroed LDC_CTRL frame
 * with the given stype/ctrl.  When 'data' is non-NULL, dlen bytes are
 * copied into the start of the payload.  Returns the packet (not yet
 * sent) or NULL if the TX ring is full; *new_tail is set on success.
 */
static struct ldc_packet *handshake_compose_ctrl(struct ldc_channel *lp,
						 u8 stype, u8 ctrl,
						 void *data, int dlen,
						 unsigned long *new_tail)
{
	struct ldc_packet *p = handshake_get_tx_packet(lp, new_tail);

	if (p) {
		memset(p, 0, sizeof(*p));
		p->type = LDC_CTRL;
		p->stype = stype;
		p->ctrl = ctrl;
		if (data)
			memcpy(p->u.u_data, data, dlen);
	}
	return p;
}

/* Open the handshake by sending a VERS/INFO frame advertising
 * ver_arr[0].  Clears LDC_FLAG_RESET once the frame has been queued.
 * Returns 0, the send_tx_packet() error, or -EBUSY if no TX slot is
 * available.
 */
static int start_handshake(struct ldc_channel *lp)
{
	struct ldc_packet *p;
	struct ldc_version *ver;
	unsigned long new_tail;

	ver = &ver_arr[0];

	ldcdbg(HS, "SEND VER INFO maj[%u] min[%u]\n",
	       ver->major, ver->minor);

	p = handshake_compose_ctrl(lp, LDC_INFO, LDC_VERS,
				   ver, sizeof(*ver), &new_tail);
	if (p) {
		int err = send_tx_packet(lp, p, new_tail);
		if (!err)
			lp->flags &= ~LDC_FLAG_RESET;
		return err;
	}
	return -EBUSY;
}

/* Send a VERS/NACK frame carrying the given major/minor.
 * Returns 0, the send error, or -EBUSY if no TX slot is available.
 */
static int send_version_nack(struct ldc_channel *lp,
			     u16 major, u16 minor)
{
	struct ldc_packet *p;
	struct ldc_version ver;
	unsigned long new_tail;

	ver.major = major;
	ver.minor = minor;

	p = handshake_compose_ctrl(lp, LDC_NACK, LDC_VERS,
				   &ver, sizeof(ver), &new_tail);
	if (p) {
		ldcdbg(HS, "SEND VER NACK maj[%u] min[%u]\n",
		       ver.major, ver.minor);

		return send_tx_packet(lp, p, new_tail);
	}
	return -EBUSY;
}

/* Send a VERS/ACK frame carrying *vp.
 * Returns 0, the send error, or -EBUSY if no TX slot is available.
 */
static int send_version_ack(struct ldc_channel *lp,
			    struct ldc_version *vp)
{
	struct ldc_packet *p;
	unsigned long new_tail;

	p = handshake_compose_ctrl(lp, LDC_ACK, LDC_VERS,
				   vp, sizeof(*vp), &new_tail);
	if (p) {
		ldcdbg(HS, "SEND VER ACK maj[%u] min[%u]\n",
		       vp->major, vp->minor);

		return send_tx_packet(lp, p, new_tail);
	}
	return -EBUSY;
}

/* Send an RTS/INFO frame advertising our mode in 'env' with seqid 0,
 * and reset rcv_nxt to 0.
 * Returns 0, the send error, or -EBUSY if no TX slot is available.
 */
static int send_rts(struct ldc_channel *lp)
{
	struct ldc_packet *p;
	unsigned long new_tail;

	p = handshake_compose_ctrl(lp, LDC_INFO, LDC_RTS, NULL, 0,
				   &new_tail);
	if (p) {
		p->env = lp->cfg.mode;
		p->seqid = 0;
		lp->rcv_nxt = 0;

		ldcdbg(HS, "SEND RTS env[0x%x] seqid[0x%x]\n",
		       p->env, p->seqid);

		return send_tx_packet(lp, p, new_tail);
	}
	return -EBUSY;
}

/* Send an RTR/INFO frame advertising our mode in 'env' with seqid 0.
 * Returns 0, the send error, or -EBUSY if no TX slot is available.
 */
static int send_rtr(struct ldc_channel *lp)
{
	struct ldc_packet *p;
	unsigned long new_tail;

	p = handshake_compose_ctrl(lp, LDC_INFO, LDC_RTR, NULL, 0,
				   &new_tail);
	if (p) {
		p->env = lp->cfg.mode;
		p->seqid = 0;

		ldcdbg(HS, "SEND RTR env[0x%x] seqid[0x%x]\n",
		       p->env, p->seqid);

		return send_tx_packet(lp, p, new_tail);
	}
	return -EBUSY;
}

/* Send an RDX/INFO frame.  snd_nxt is incremented as soon as the frame
 * is composed (before the send is attempted) and the frame ACKs
 * rcv_nxt.
 * Returns 0, the send error, or -EBUSY if no TX slot is available.
 */
static int send_rdx(struct ldc_channel *lp)
{
	struct ldc_packet *p;
	unsigned long new_tail;

	p = handshake_compose_ctrl(lp, LDC_INFO, LDC_RDX, NULL, 0,
				   &new_tail);
	if (p) {
		p->env = 0;
		p->seqid = ++lp->snd_nxt;
		p->u.r.ackid = lp->rcv_nxt;

		ldcdbg(HS, "SEND RDX env[0x%x] seqid[0x%x] ackid[0x%x]\n",
		       p->env, p->seqid, p->u.r.ackid);

		return send_tx_packet(lp, p, new_tail);
	}
	return -EBUSY;
}

/* Send a NACK that mirrors the type and (masked) ctrl of data_pkt,
 * with seqid snd_nxt + 1 and ackid rcv_nxt.  snd_nxt is advanced only
 * if the send succeeds.
 * Returns 0, the send error, or -EBUSY if no TX slot is available.
 */
static int send_data_nack(struct ldc_channel *lp, struct ldc_packet *data_pkt)
{
	struct ldc_packet *p;
	unsigned long new_tail;
	int err;

	p = data_get_tx_packet(lp, &new_tail);
	if (!p)
		return -EBUSY;
	memset(p, 0, sizeof(*p));
	p->type = data_pkt->type;
	p->stype = LDC_NACK;
	p->ctrl = data_pkt->ctrl & LDC_CTRL_MSK;
	p->seqid = lp->snd_nxt + 1;
	p->u.r.ackid = lp->rcv_nxt;

	ldcdbg(HS, "SEND DATA NACK type[0x%x] ctl[0x%x] seq[0x%x] ack[0x%x]\n",
	       p->type, p->ctrl, p->seqid, p->u.r.ackid);

	err = send_tx_packet(lp, p, new_tail);
	if (!err)
		lp->snd_nxt++;

	return err;
}

/* Abort the connection: reconfigure both queues with the hypervisor
 * and refetch their head/tail/state into *lp.  Always returns
 * -ECONNRESET so callers can 'return ldc_abort(lp);'.
 */
static int ldc_abort(struct ldc_channel *lp)
{
	unsigned long hv_err;

	ldcdbg(STATE, "ABORT\n");

	/* We report but do not act upon the hypervisor errors because
	 * there really isn't much we can do if they fail at this point.
	 */
	hv_err = sun4v_ldc_tx_qconf(lp->id, lp->tx_ra, lp->tx_num_entries);
	if (hv_err)
		printk(KERN_ERR PFX "ldc_abort: "
		       "sun4v_ldc_tx_qconf(%lx,%lx,%lx) failed, err=%lu\n",
		       lp->id, lp->tx_ra, lp->tx_num_entries, hv_err);

	hv_err = sun4v_ldc_tx_get_state(lp->id,
					&lp->tx_head,
					&lp->tx_tail,
					&lp->chan_state);
	if (hv_err)
		printk(KERN_ERR PFX "ldc_abort: "
		       "sun4v_ldc_tx_get_state(%lx,...) failed, err=%lu\n",
		       lp->id, hv_err);

	hv_err = sun4v_ldc_rx_qconf(lp->id, lp->rx_ra, lp->rx_num_entries);
	if (hv_err)
		printk(KERN_ERR PFX "ldc_abort: "
		       "sun4v_ldc_rx_qconf(%lx,%lx,%lx) failed, err=%lu\n",
		       lp->id, lp->rx_ra, lp->rx_num_entries, hv_err);

	/* Refetch the RX queue state as well, because we could be invoked
	 * here in the queue processing context.
	 */
	hv_err = sun4v_ldc_rx_get_state(lp->id,
					&lp->rx_head,
					&lp->rx_tail,
					&lp->chan_state);
	if (hv_err)
		printk(KERN_ERR PFX "ldc_abort: "
		       "sun4v_ldc_rx_get_state(%lx,...) failed, err=%lu\n",
		       lp->id, hv_err);

	return -ECONNRESET;
}

/* Return the first ver_arr[] entry whose major is <= 'major', or NULL
 * if there is none.  Relies on ver_arr[] being sorted with the largest
 * major first.
 */
static struct ldc_version *find_by_major(u16 major)
{
	struct ldc_version *ret = NULL;
	int i;

	for (i = 0; i < ARRAY_SIZE(ver_arr); i++) {
		struct ldc_version *v = &ver_arr[i];
		if (v->major <= major) {
			ret = v;
			break;
		}
	}
	return ret;
}

/* Handle a VERS/INFO frame from the peer.  Replies with a NACK
 * (0.0 when nothing is compatible, otherwise our closest version) or
 * with an ACK for the peer's major and the lower of the two minors.
 * Returns 0, or the ldc_abort() result if the reply could not be sent.
 */
static int process_ver_info(struct ldc_channel *lp, struct ldc_version *vp)
{
	struct ldc_version *vap;
	int err;

	ldcdbg(HS, "GOT VERSION INFO major[%x] minor[%x]\n",
	       vp->major, vp->minor);

	/* A new INFO after we already agreed restarts negotiation. */
	if (lp->hs_state == LDC_HS_GOTVERS) {
		lp->hs_state = LDC_HS_OPEN;
		memset(&lp->ver, 0, sizeof(lp->ver));
	}

	vap = find_by_major(vp->major);
	if (!vap) {
		err = send_version_nack(lp, 0, 0);
	} else if (vap->major != vp->major) {
		err = send_version_nack(lp, vap->major, vap->minor);
	} else {
		struct ldc_version ver = *vp;
		if (ver.minor > vap->minor)
			ver.minor = vap->minor;
		err = send_version_ack(lp, &ver);
		if (!err) {
			lp->ver = ver;
			lp->hs_state = LDC_HS_GOTVERS;
		}
	}
	if (err)
		return ldc_abort(lp);

	return 0;
}

/* Handle a VERS/ACK frame.  If a version was already recorded it must
 * match the ACKed one; otherwise the ACKed version is recorded.  Then
 * an RTS is sent.  Returns 0 or the ldc_abort() result.
 */
static int process_ver_ack(struct ldc_channel *lp, struct ldc_version *vp)
{
	ldcdbg(HS, "GOT VERSION ACK major[%x] minor[%x]\n",
	       vp->major, vp->minor);

	if (lp->hs_state == LDC_HS_GOTVERS) {
		if (lp->ver.major != vp->major ||
		    lp->ver.minor != vp->minor)
			return ldc_abort(lp);
	} else {
		lp->ver = *vp;
		lp->hs_state = LDC_HS_GOTVERS;
	}
	if (send_rts(lp))
		return ldc_abort(lp);
	return 0;
}

/* Handle a VERS/NACK frame.  A 0.0 NACK, or a suggested major we have
 * no entry for, aborts; otherwise a new VERS/INFO with our matching
 * table entry is sent.  Returns the send result or the ldc_abort()
 * result.
 */
static int process_ver_nack(struct ldc_channel *lp, struct ldc_version *vp)
{
	struct ldc_version *vap;

	if ((vp->major == 0 && vp->minor == 0) ||
	    !(vap = find_by_major(vp->major))) {
		return ldc_abort(lp);
	} else {
		struct ldc_packet *p;
		unsigned long new_tail;

		p = handshake_compose_ctrl(lp, LDC_INFO, LDC_VERS,
					   vap, sizeof(*vap),
					   &new_tail);
		if (p)
			return send_tx_packet(lp, p, new_tail);
		else
			return ldc_abort(lp);
	}
}

/* Dispatch an LDC_VERS control frame on its stype; the version is read
 * from the start of the payload.  Unknown stypes abort.
 */
static int process_version(struct ldc_channel *lp,
			   struct ldc_packet *p)
{
	struct ldc_version *vp;

	vp = (struct ldc_version *) p->u.u_data;

	switch (p->stype) {
	case LDC_INFO:
		return process_ver_info(lp, vp);

	case LDC_ACK:
		return process_ver_ack(lp, vp);

	case LDC_NACK:
		return process_ver_nack(lp, vp);

	default:
		return ldc_abort(lp);
	}
}

/* Handle an RTS frame.  Valid only as INFO, after version agreement
 * and with a mode ('env') equal to ours.  Seeds snd_nxt/rcv_nxt from
 * the peer's seqid and answers with RTR.  Returns 0 or the ldc_abort()
 * result.
 */
static int process_rts(struct ldc_channel *lp,
		       struct ldc_packet *p)
{
	ldcdbg(HS, "GOT RTS stype[%x] seqid[%x] env[%x]\n",
	       p->stype, p->seqid, p->env);

	if (p->stype     != LDC_INFO	   ||
	    lp->hs_state != LDC_HS_GOTVERS ||
	    p->env       != lp->cfg.mode)
		return ldc_abort(lp);

	lp->snd_nxt = p->seqid;
	lp->rcv_nxt = p->seqid;
	lp->hs_state = LDC_HS_SENTRTR;
	if (send_rtr(lp))
		return ldc_abort(lp);

	return 0;
}

/* Handle an RTR frame.  Valid only as INFO with a mode equal to ours.
 * Marks the handshake complete, moves to CONNECTED and sends RDX.
 * Returns LDC_EVENT_UP on success or the ldc_abort() result.
 */
static int process_rtr(struct ldc_channel *lp,
		       struct ldc_packet *p)
{
	ldcdbg(HS, "GOT RTR stype[%x] seqid[%x] env[%x]\n",
	       p->stype, p->seqid, p->env);

	if (p->stype     != LDC_INFO ||
	    p->env       != lp->cfg.mode)
		return ldc_abort(lp);

	lp->snd_nxt = p->seqid;
	lp->hs_state = LDC_HS_COMPLETE;
	ldc_set_state(lp, LDC_STATE_CONNECTED);
	/* NOTE(review): the send_rdx() result is ignored here. */
	send_rdx(lp);

	return LDC_EVENT_UP;
}

/* True when seqid is exactly one past the last sequence received. */
static int rx_seq_ok(struct ldc_channel *lp, u32 seqid)
{
	return lp->rcv_nxt + 1 == seqid;
}

/* Handle an RDX frame.  Valid only as INFO with the expected seqid.
 * Marks the handshake complete and moves to CONNECTED.
 * Returns LDC_EVENT_UP on success or the ldc_abort() result.
 */
static int process_rdx(struct ldc_channel *lp,
		       struct ldc_packet *p)
{
	ldcdbg(HS, "GOT RDX stype[%x] seqid[%x] env[%x] ackid[%x]\n",
	       p->stype, p->seqid, p->env, p->u.r.ackid);

	if (p->stype != LDC_INFO ||
	    !(rx_seq_ok(lp, p->seqid)))
		return ldc_abort(lp);

	lp->rcv_nxt = p->seqid;

	lp->hs_state = LDC_HS_COMPLETE;
	ldc_set_state(lp, LDC_STATE_CONNECTED);

	return LDC_EVENT_UP;
}

/* Dispatch an LDC_CTRL frame on its ctrl field.  Returns a negative
 * error, 0, or a positive LDC_EVENT_* mask from the handler; unknown
 * ctrl values abort.
 */
static int process_control_frame(struct ldc_channel *lp,
				 struct ldc_packet *p)
{
	switch (p->ctrl) {
	case LDC_VERS:
		return process_version(lp, p);

	case LDC_RTS:
		return process_rts(lp, p);

	case LDC_RTR:
		return process_rtr(lp, p);

	case LDC_RDX:
		return process_rdx(lp, p);

	default:
		return ldc_abort(lp);
	}
}

/* Any LDC_ERR frame aborts the connection. */
static int process_error_frame(struct ldc_channel *lp,
			       struct ldc_packet *p)
{
	return ldc_abort(lp);
}

/* Process the ackid carried by 'ack': walk the TX ring from tx_acked
 * looking for the packet whose seqid equals ackid and move tx_acked
 * just past it.  Reaching tx_tail without a match aborts.
 * Returns 0 or the ldc_abort() result.
 */
static int process_data_ack(struct ldc_channel *lp,
			    struct ldc_packet *ack)
{
	unsigned long head = lp->tx_acked;
	u32 ackid = ack->u.r.ackid;

	while (1) {
		struct ldc_packet *p = lp->tx_base + (head / LDC_PACKET_SIZE);

		head = tx_advance(lp, head);

		if (p->seqid == ackid) {
			lp->tx_acked = head;
			return 0;
		}
		if (head == lp->tx_tail)
			return ldc_abort(lp);
	}

	return 0;
}

/* Invoke the client's event callback once per bit set in event_mask,
 * in the order RESET, UP, DATA_READY.
 */
static void send_events(struct ldc_channel *lp, unsigned int event_mask)
{
	if (event_mask & LDC_EVENT_RESET)
		lp->cfg.event(lp->event_arg, LDC_EVENT_RESET);
	if (event_mask & LDC_EVENT_UP)
		lp->cfg.event(lp->event_arg, LDC_EVENT_UP);
	if (event_mask & LDC_EVENT_DATA_READY)
		lp->cfg.event(lp->event_arg, LDC_EVENT_DATA_READY);
}

/* RX interrupt handler (registered by ldc_bind() with dev_id == lp).
 * Refreshes the RX queue state and, while the handshake is still in
 * progress, consumes control frames from the RX ring.  Events are
 * accumulated under lp->lock and delivered after it is dropped.
 */
static irqreturn_t ldc_rx(int irq, void *dev_id)
{
	struct ldc_channel *lp = dev_id;
	unsigned long orig_state, hv_err, flags;
	unsigned int event_mask;

	spin_lock_irqsave(&lp->lock, flags);

	orig_state = lp->chan_state;
	/* NOTE(review): hv_err is assigned but never examined. */
	hv_err = sun4v_ldc_rx_get_state(lp->id,
					&lp->rx_head,
					&lp->rx_tail,
					&lp->chan_state);

	ldcdbg(RX, "RX state[0x%02lx:0x%02lx] head[0x%04lx] tail[0x%04lx]\n",
	       orig_state, lp->chan_state, lp->rx_head, lp->rx_tail);

	event_mask = 0;

	/* RAW mode has no handshake: the link being up means connected.
	 * orig_state is resynced so the state-change test below does not
	 * report the same transition again.
	 */
	if (lp->cfg.mode == LDC_MODE_RAW &&
	    lp->chan_state == LDC_CHANNEL_UP) {
		lp->hs_state = LDC_HS_COMPLETE;
		ldc_set_state(lp, LDC_STATE_CONNECTED);

		event_mask |= LDC_EVENT_UP;

		orig_state = lp->chan_state;
	}

	/* If we are in reset state, flush the RX queue and ignore
	 * everything.
	 */
	if (lp->flags & LDC_FLAG_RESET) {
		(void) __set_rx_head(lp, lp->rx_tail);
		goto out;
	}

	/* Once we finish the handshake, we let the ldc_read()
	 * paths do all of the control frame and state management.
	 * Just trigger the callback.
	 */
	if (lp->hs_state == LDC_HS_COMPLETE) {
		/* Also entered by goto from the loop below as soon as a
		 * control frame completes the handshake.
		 */
handshake_complete:
		if (lp->chan_state != orig_state) {
			unsigned int event = LDC_EVENT_RESET;

			if (lp->chan_state == LDC_CHANNEL_UP)
				event = LDC_EVENT_UP;

			event_mask |= event;
		}
		if (lp->rx_head != lp->rx_tail)
			event_mask |= LDC_EVENT_DATA_READY;

		goto out;
	}

	/* Handshake still in progress: skip processing if the channel
	 * state changed since the previous snapshot.
	 */
	if (lp->chan_state != orig_state)
		goto out;

	while (lp->rx_head != lp->rx_tail) {
		struct ldc_packet *p;
		unsigned long new;
		int err;

		p = lp->rx_base + (lp->rx_head / LDC_PACKET_SIZE);

		switch (p->type) {
		case LDC_CTRL:
			/* Positive returns are LDC_EVENT_* bits. */
			err = process_control_frame(lp, p);
			if (err > 0)
				event_mask |= err;
			break;

		case LDC_DATA:
			event_mask |= LDC_EVENT_DATA_READY;
			err = 0;
			break;

		case LDC_ERR:
			err = process_error_frame(lp, p);
			break;

		default:
			err = ldc_abort(lp);
			break;
		}

		if (err < 0)
			break;

		/* Consume the packet: advance the head with wrap-around
		 * and publish it to the hypervisor.
		 */
		new = lp->rx_head;
		new += LDC_PACKET_SIZE;
		if (new == (lp->rx_num_entries * LDC_PACKET_SIZE))
			new = 0;
		lp->rx_head = new;

		err = __set_rx_head(lp, new);
		if (err < 0) {
			(void) ldc_abort(lp);
			break;
		}
		if (lp->hs_state == LDC_HS_COMPLETE)
			goto handshake_complete;
	}

out:
	spin_unlock_irqrestore(&lp->lock, flags);

	send_events(lp, event_mask);

	return IRQ_HANDLED;
}

/* TX interrupt handler (registered by ldc_bind() with dev_id == lp).
 * Refreshes the TX queue state and, in RAW mode, reports the link
 * coming up.
 */
static irqreturn_t ldc_tx(int irq, void *dev_id)
{
	struct ldc_channel *lp = dev_id;
	unsigned long flags, hv_err, orig_state;
	unsigned int event_mask = 0;

	spin_lock_irqsave(&lp->lock, flags);

	orig_state = lp->chan_state;
	/* NOTE(review): hv_err is assigned but never examined. */
	hv_err = sun4v_ldc_tx_get_state(lp->id,
					&lp->tx_head,
					&lp->tx_tail,
					&lp->chan_state);

	ldcdbg(TX, " TX state[0x%02lx:0x%02lx] head[0x%04lx] tail[0x%04lx]\n",
	       orig_state, lp->chan_state, lp->tx_head, lp->tx_tail);

	if (lp->cfg.mode == LDC_MODE_RAW &&
	    lp->chan_state == LDC_CHANNEL_UP) {
		lp->hs_state = LDC_HS_COMPLETE;
		ldc_set_state(lp, LDC_STATE_CONNECTED);

		event_mask |= LDC_EVENT_UP;
	}

	spin_unlock_irqrestore(&lp->lock, flags);

	send_events(lp, event_mask);

	return IRQ_HANDLED;
}

/* XXX ldc_alloc() and ldc_free() needs to run under a mutex so
 * XXX that addition and removal from the ldc_channel_list has
 * XXX atomicity, otherwise the __ldc_channel_exists() check is
 * XXX totally pointless as another thread can slip into ldc_alloc()
 * XXX and add a channel with the same ID.  There also needs to be
 * XXX a spinlock for ldc_channel_list.
 */
static HLIST_HEAD(ldc_channel_list);

/* Return 1 if a channel with this id is on ldc_channel_list, else 0. */
static int __ldc_channel_exists(unsigned long id)
{
	struct ldc_channel *lp;
	struct hlist_node *n;

	hlist_for_each_entry(lp, n, &ldc_channel_list, list) {
		if (lp->id == id)
			return 1;
	}
	return 0;
}

/* Allocate and zero a queue of num_entries packets.  On success *base
 * receives the kernel address and *ra its __pa().  'name' is used only
 * in the failure message.  Returns 0 or -ENOMEM.
 */
static int alloc_queue(const char *name, unsigned long num_entries,
		       struct ldc_packet **base, unsigned long *ra)
{
	unsigned long size, order;
	void *q;

	size = num_entries * LDC_PACKET_SIZE;
	order = get_order(size);

	q = (void *) __get_free_pages(GFP_KERNEL, order);
	if (!q) {
		printk(KERN_ERR PFX "Alloc of %s queue failed with "
		       "size=%lu order=%lu\n", name, size, order);
		return -ENOMEM;
	}

	/* Zero the whole allocation, not just 'size' bytes. */
	memset(q, 0, PAGE_SIZE << order);

	*base = q;
	*ra = __pa(q);

	return 0;
}

/* Free a queue obtained from alloc_queue(); num_entries must be the
 * value it was allocated with.  A NULL queue is ignored.
 */
static void free_queue(unsigned long num_entries, struct ldc_packet *q)
{
	unsigned long size, order;

	if (!q)
		return;

	size = num_entries * LDC_PACKET_SIZE;
	order = get_order(size);

	free_pages((unsigned long)q, order);
}

/* XXX Make this configurable...
XXX */ #define LDC_IOTABLE_SIZE (8 * 1024) static int ldc_iommu_init(struct ldc_channel *lp) { unsigned long sz, num_tsb_entries, tsbsize, order; struct ldc_iommu *iommu = &lp->iommu; struct ldc_mtable_entry *table; unsigned long hv_err; int err; num_tsb_entries = LDC_IOTABLE_SIZE; tsbsize = num_tsb_entries * sizeof(struct ldc_mtable_entry); spin_lock_init(&iommu->lock); sz = num_tsb_entries / 8; sz = (sz + 7UL) & ~7UL; iommu->arena.map = kzalloc(sz, GFP_KERNEL); if (!iommu->arena.map) { printk(KERN_ERR PFX "Alloc of arena map failed, sz=%lu\n", sz); return -ENOMEM; } iommu->arena.limit = num_tsb_entries; order = get_order(tsbsize); table = (struct ldc_mtable_entry *) __get_free_pages(GFP_KERNEL, order); err = -ENOMEM; if (!table) { printk(KERN_ERR PFX "Alloc of MTE table failed, " "size=%lu order=%lu\n", tsbsize, order); goto out_free_map; } memset(table, 0, PAGE_SIZE << order); iommu->page_table = table; hv_err = sun4v_ldc_set_map_table(lp->id, __pa(table), num_tsb_entries); err = -EINVAL; if (hv_err) goto out_free_table; return 0; out_free_table: free_pages((unsigned long) table, order); iommu->page_table = NULL; out_free_map: kfree(iommu->arena.map); iommu->arena.map = NULL; return err; } static void ldc_iommu_release(struct ldc_channel *lp) { struct ldc_iommu *iommu = &lp->iommu; unsigned long num_tsb_entries, tsbsize, order; (void) sun4v_ldc_set_map_table(lp->id, 0, 0); num_tsb_entries = iommu->arena.limit; tsbsize = num_tsb_entries * sizeof(struct ldc_mtable_entry); order = get_order(tsbsize); free_pages((unsigned long) iommu->page_table, order); iommu->page_table = NULL; kfree(iommu->arena.map); iommu->arena.map = NULL; } struct ldc_channel *ldc_alloc(unsigned long id, const struct ldc_channel_config *cfgp, void *event_arg) { struct ldc_channel *lp; const struct ldc_mode_ops *mops; unsigned long dummy1, dummy2, hv_err; u8 mss, *mssbuf; int err; err = -ENODEV; if (!ldom_domaining_enabled) goto out_err; err = -EINVAL; if (!cfgp) goto out_err; switch 
(cfgp->mode) { case LDC_MODE_RAW: mops = &raw_ops; mss = LDC_PACKET_SIZE; break; case LDC_MODE_UNRELIABLE: mops = &nonraw_ops; mss = LDC_PACKET_SIZE - 8; break; case LDC_MODE_STREAM: mops = &stream_ops; mss = LDC_PACKET_SIZE - 8 - 8; break; default: goto out_err; } if (!cfgp->event || !event_arg || !cfgp->rx_irq || !cfgp->tx_irq) goto out_err; hv_err = sun4v_ldc_tx_qinfo(id, &dummy1, &dummy2); err = -ENODEV; if (hv_err == HV_ECHANNEL) goto out_err; err = -EEXIST; if (__ldc_channel_exists(id)) goto out_err; mssbuf = NULL; lp = kzalloc(sizeof(*lp), GFP_KERNEL); err = -ENOMEM; if (!lp) goto out_err; spin_lock_init(&lp->lock); lp->id = id; err = ldc_iommu_init(lp); if (err) goto out_free_ldc; lp->mops = mops; lp->mss = mss; lp->cfg = *cfgp; if (!lp->cfg.mtu) lp->cfg.mtu = LDC_DEFAULT_MTU; if (lp->cfg.mode == LDC_MODE_STREAM) { mssbuf = kzalloc(lp->cfg.mtu, GFP_KERNEL); if (!mssbuf) { err = -ENOMEM; goto out_free_iommu; } lp->mssbuf = mssbuf; } lp->event_arg = event_arg; /* XXX allow setting via ldc_channel_config to override defaults * XXX or use some formula based upon mtu */ lp->tx_num_entries = LDC_DEFAULT_NUM_ENTRIES; lp->rx_num_entries = LDC_DEFAULT_NUM_ENTRIES; err = alloc_queue("TX", lp->tx_num_entries, &lp->tx_base, &lp->tx_ra); if (err) goto out_free_mssbuf; err = alloc_queue("RX", lp->rx_num_entries, &lp->rx_base, &lp->rx_ra); if (err) goto out_free_txq; lp->flags |= LDC_FLAG_ALLOCED_QUEUES; lp->hs_state = LDC_HS_CLOSED; ldc_set_state(lp, LDC_STATE_INIT); INIT_HLIST_NODE(&lp->list); hlist_add_head(&lp->list, &ldc_channel_list); INIT_HLIST_HEAD(&lp->mh_list); return lp; out_free_txq: free_queue(lp->tx_num_entries, lp->tx_base); out_free_mssbuf: if (mssbuf) kfree(mssbuf); out_free_iommu: ldc_iommu_release(lp); out_free_ldc: kfree(lp); out_err: return ERR_PTR(err); } EXPORT_SYMBOL(ldc_alloc); void ldc_free(struct ldc_channel *lp) { if (lp->flags & LDC_FLAG_REGISTERED_IRQS) { free_irq(lp->cfg.rx_irq, lp); free_irq(lp->cfg.tx_irq, lp); } if (lp->flags & 
LDC_FLAG_REGISTERED_QUEUES) { sun4v_ldc_tx_qconf(lp->id, 0, 0); sun4v_ldc_rx_qconf(lp->id, 0, 0); lp->flags &= ~LDC_FLAG_REGISTERED_QUEUES; } if (lp->flags & LDC_FLAG_ALLOCED_QUEUES) { free_queue(lp->tx_num_entries, lp->tx_base); free_queue(lp->rx_num_entries, lp->rx_base); lp->flags &= ~LDC_FLAG_ALLOCED_QUEUES; } hlist_del(&lp->list); if (lp->mssbuf) kfree(lp->mssbuf); ldc_iommu_release(lp); kfree(lp); } EXPORT_SYMBOL(ldc_free); /* Bind the channel. This registers the LDC queues with * the hypervisor and puts the channel into a pseudo-listening * state. This does not initiate a handshake, ldc_connect() does * that. */ int ldc_bind(struct ldc_channel *lp, const char *name) { unsigned long hv_err, flags; int err = -EINVAL; spin_lock_irqsave(&lp->lock, flags); if (!name) goto out_err; if (lp->state != LDC_STATE_INIT) goto out_err; snprintf(lp->rx_irq_name, LDC_IRQ_NAME_MAX, "%s RX", name); snprintf(lp->tx_irq_name, LDC_IRQ_NAME_MAX, "%s TX", name); err = request_irq(lp->cfg.rx_irq, ldc_rx, IRQF_SAMPLE_RANDOM | IRQF_SHARED, lp->rx_irq_name, lp); if (err) goto out_err; err = request_irq(lp->cfg.tx_irq, ldc_tx, IRQF_SAMPLE_RANDOM | IRQF_SHARED, lp->tx_irq_name, lp); if (err) goto out_free_rx_irq; lp->flags |= LDC_FLAG_REGISTERED_IRQS; err = -ENODEV; hv_err = sun4v_ldc_tx_qconf(lp->id, 0, 0); if (hv_err) goto out_free_tx_irq; hv_err = sun4v_ldc_tx_qconf(lp->id, lp->tx_ra, lp->tx_num_entries); if (hv_err) goto out_free_tx_irq; hv_err = sun4v_ldc_rx_qconf(lp->id, 0, 0); if (hv_err) goto out_unmap_tx; hv_err = sun4v_ldc_rx_qconf(lp->id, lp->rx_ra, lp->rx_num_entries); if (hv_err) goto out_unmap_tx; lp->flags |= LDC_FLAG_REGISTERED_QUEUES; hv_err = sun4v_ldc_tx_get_state(lp->id, &lp->tx_head, &lp->tx_tail, &lp->chan_state); err = -EBUSY; if (hv_err) goto out_unmap_rx; lp->tx_acked = lp->tx_head; lp->hs_state = LDC_HS_OPEN; ldc_set_state(lp, LDC_STATE_BOUND); spin_unlock_irqrestore(&lp->lock, flags); return 0; out_unmap_rx: lp->flags &= ~LDC_FLAG_REGISTERED_QUEUES; 
sun4v_ldc_rx_qconf(lp->id, 0, 0); out_unmap_tx: sun4v_ldc_tx_qconf(lp->id, 0, 0); out_free_tx_irq: lp->flags &= ~LDC_FLAG_REGISTERED_IRQS; free_irq(lp->cfg.tx_irq, lp); out_free_rx_irq: free_irq(lp->cfg.rx_irq, lp); out_err: spin_unlock_irqrestore(&lp->lock, flags); return err; } EXPORT_SYMBOL(ldc_bind); int ldc_connect(struct ldc_channel *lp) { unsigned long flags; int err; if (lp->cfg.mode == LDC_MODE_RAW) return -EINVAL; spin_lock_irqsave(&lp->lock, flags); if (!(lp->flags & LDC_FLAG_ALLOCED_QUEUES) || !(lp->flags & LDC_FLAG_REGISTERED_QUEUES) || lp->hs_state != LDC_HS_OPEN) err = -EINVAL; else err = start_handshake(lp); spin_unlock_irqrestore(&lp->lock, flags); return err; } EXPORT_SYMBOL(ldc_connect); int ldc_disconnect(struct ldc_channel *lp) { unsigned long hv_err, flags; int err; if (lp->cfg.mode == LDC_MODE_RAW) return -EINVAL; if (!(lp->flags & LDC_FLAG_ALLOCED_QUEUES) || !(lp->flags & LDC_FLAG_REGISTERED_QUEUES)) return -EINVAL; spin_lock_irqsave(&lp->lock, flags); err = -ENODEV; hv_err = sun4v_ldc_tx_qconf(lp->id, 0, 0); if (hv_err) goto out_err; hv_err = sun4v_ldc_tx_qconf(lp->id, lp->tx_ra, lp->tx_num_entries); if (hv_err) goto out_err; hv_err = sun4v_ldc_rx_qconf(lp->id, 0, 0); if (hv_err) goto out_err; hv_err = sun4v_ldc_rx_qconf(lp->id, lp->rx_ra, lp->rx_num_entries); if (hv_err) goto out_err; ldc_set_state(lp, LDC_STATE_BOUND); lp->hs_state = LDC_HS_OPEN; lp->flags |= LDC_FLAG_RESET; spin_unlock_irqrestore(&lp->lock, flags); return 0; out_err: sun4v_ldc_tx_qconf(lp->id, 0, 0); sun4v_ldc_rx_qconf(lp->id, 0, 0); free_irq(lp->cfg.tx_irq, lp); free_irq(lp->cfg.rx_irq, lp); lp->flags &= ~(LDC_FLAG_REGISTERED_IRQS | LDC_FLAG_REGISTERED_QUEUES); ldc_set_state(lp, LDC_STATE_INIT); spin_unlock_irqrestore(&lp->lock, flags); return err; } EXPORT_SYMBOL(ldc_disconnect); int ldc_state(struct ldc_channel *lp) { return lp->state; } EXPORT_SYMBOL(ldc_state); static int write_raw(struct ldc_channel *lp, const void *buf, unsigned int size) { struct ldc_packet *p; 
unsigned long new_tail; int err; if (size > LDC_PACKET_SIZE) return -EMSGSIZE; p = data_get_tx_packet(lp, &new_tail); if (!p) return -EAGAIN; memcpy(p, buf, size); err = send_tx_packet(lp, p, new_tail); if (!err) err = size; return err; } static int read_raw(struct ldc_channel *lp, void *buf, unsigned int size) { struct ldc_packet *p; unsigned long hv_err, new; int err; if (size < LDC_PACKET_SIZE) return -EINVAL; hv_err = sun4v_ldc_rx_get_state(lp->id, &lp->rx_head, &lp->rx_tail, &lp->chan_state); if (hv_err) return ldc_abort(lp); if (lp->chan_state == LDC_CHANNEL_DOWN || lp->chan_state == LDC_CHANNEL_RESETTING) return -ECONNRESET; if (lp->rx_head == lp->rx_tail) return 0; p = lp->rx_base + (lp->rx_head / LDC_PACKET_SIZE); memcpy(buf, p, LDC_PACKET_SIZE); new = rx_advance(lp, lp->rx_head); lp->rx_head = new; err = __set_rx_head(lp, new); if (err < 0) err = -ECONNRESET; else err = LDC_PACKET_SIZE; return err; } static const struct ldc_mode_ops raw_ops = { .write = write_raw, .read = read_raw, }; static int write_nonraw(struct ldc_channel *lp, const void *buf, unsigned int size) { unsigned long hv_err, tail; unsigned int copied; u32 seq; int err; hv_err = sun4v_ldc_tx_get_state(lp->id, &lp->tx_head, &lp->tx_tail, &lp->chan_state); if (unlikely(hv_err)) return -EBUSY; if (unlikely(lp->chan_state != LDC_CHANNEL_UP)) return ldc_abort(lp); if (!tx_has_space_for(lp, size)) return -EAGAIN; seq = lp->snd_nxt; copied = 0; tail = lp->tx_tail; while (copied < size) { struct ldc_packet *p = lp->tx_base + (tail / LDC_PACKET_SIZE); u8 *data = ((lp->cfg.mode == LDC_MODE_UNRELIABLE) ? p->u.u_data : p->u.r.r_data); int data_len; p->type = LDC_DATA; p->stype = LDC_INFO; p->ctrl = 0; data_len = size - copied; if (data_len > lp->mss) data_len = lp->mss; BUG_ON(data_len > LDC_LEN); p->env = (data_len | (copied == 0 ? LDC_START : 0) | (data_len == size - copied ? 
LDC_STOP : 0)); p->seqid = ++seq; ldcdbg(DATA, "SENT DATA [%02x:%02x:%02x:%02x:%08x]\n", p->type, p->stype, p->ctrl, p->env, p->seqid); memcpy(data, buf, data_len); buf += data_len; copied += data_len; tail = tx_advance(lp, tail); } err = set_tx_tail(lp, tail); if (!err) { lp->snd_nxt = seq; err = size; } return err; } static int rx_bad_seq(struct ldc_channel *lp, struct ldc_packet *p, struct ldc_packet *first_frag) { int err; if (first_frag) lp->rcv_nxt = first_frag->seqid - 1; err = send_data_nack(lp, p); if (err) return err; err = __set_rx_head(lp, lp->rx_tail); if (err < 0) return ldc_abort(lp); return 0; } static int data_ack_nack(struct ldc_channel *lp, struct ldc_packet *p) { if (p->stype & LDC_ACK) { int err = process_data_ack(lp, p); if (err) return err; } if (p->stype & LDC_NACK) return ldc_abort(lp); return 0; } static int rx_data_wait(struct ldc_channel *lp, unsigned long cur_head) { unsigned long dummy; int limit = 1000; ldcdbg(DATA, "DATA WAIT cur_head[%lx] rx_head[%lx] rx_tail[%lx]\n", cur_head, lp->rx_head, lp->rx_tail); while (limit-- > 0) { unsigned long hv_err; hv_err = sun4v_ldc_rx_get_state(lp->id, &dummy, &lp->rx_tail, &lp->chan_state); if (hv_err) return ldc_abort(lp); if (lp->chan_state == LDC_CHANNEL_DOWN || lp->chan_state == LDC_CHANNEL_RESETTING) return -ECONNRESET; if (cur_head != lp->rx_tail) { ldcdbg(DATA, "DATA WAIT DONE " "head[%lx] tail[%lx] chan_state[%lx]\n", dummy, lp->rx_tail, lp->chan_state); return 0; } udelay(1); } return -EAGAIN; } static int rx_set_head(struct ldc_channel *lp, unsigned long head) { int err = __set_rx_head(lp, head); if (err < 0) return ldc_abort(lp); lp->rx_head = head; return 0; } static void send_data_ack(struct ldc_channel *lp) { unsigned long new_tail; struct ldc_packet *p; p = data_get_tx_packet(lp, &new_tail); if (likely(p)) { int err; memset(p, 0, sizeof(*p)); p->type = LDC_DATA; p->stype = LDC_ACK; p->ctrl = 0; p->seqid = lp->snd_nxt + 1; p->u.r.ackid = lp->rcv_nxt; err = send_tx_packet(lp, p, 
new_tail); if (!err) lp->snd_nxt++; } } static int read_nonraw(struct ldc_channel *lp, void *buf, unsigned int size) { struct ldc_packet *first_frag; unsigned long hv_err, new; int err, copied; hv_err = sun4v_ldc_rx_get_state(lp->id, &lp->rx_head, &lp->rx_tail, &lp->chan_state); if (hv_err) return ldc_abort(lp); if (lp->chan_state == LDC_CHANNEL_DOWN || lp->chan_state == LDC_CHANNEL_RESETTING) return -ECONNRESET; if (lp->rx_head == lp->rx_tail) return 0; first_frag = NULL; copied = err = 0; new = lp->rx_head; while (1) { struct ldc_packet *p; int pkt_len; BUG_ON(new == lp->rx_tail); p = lp->rx_base + (new / LDC_PACKET_SIZE); ldcdbg(RX, "RX read pkt[%02x:%02x:%02x:%02x:%08x:%08x] " "rcv_nxt[%08x]\n", p->type, p->stype, p->ctrl, p->env, p->seqid, p->u.r.ackid, lp->rcv_nxt); if (unlikely(!rx_seq_ok(lp, p->seqid))) { err = rx_bad_seq(lp, p, first_frag); copied = 0; break; } if (p->type & LDC_CTRL) { err = process_control_frame(lp, p); if (err < 0) break; err = 0; } lp->rcv_nxt = p->seqid; if (!(p->type & LDC_DATA)) { new = rx_advance(lp, new); goto no_data; } if (p->stype & (LDC_ACK | LDC_NACK)) { err = data_ack_nack(lp, p); if (err) break; } if (!(p->stype & LDC_INFO)) { new = rx_advance(lp, new); err = rx_set_head(lp, new); if (err) break; goto no_data; } pkt_len = p->env & LDC_LEN; /* Every initial packet starts with the START bit set. * * Singleton packets will have both START+STOP set. * * Fragments will have START set in the first frame, STOP * set in the last frame, and neither bit set in middle * frames of the packet. * * Therefore if we are at the beginning of a packet and * we don't see START, or we are in the middle of a fragmented * packet and do see START, we are unsynchronized and should * flush the RX queue. 
*/ if ((first_frag == NULL && !(p->env & LDC_START)) || (first_frag != NULL && (p->env & LDC_START))) { if (!first_frag) new = rx_advance(lp, new); err = rx_set_head(lp, new); if (err) break; if (!first_frag) goto no_data; } if (!first_frag) first_frag = p; if (pkt_len > size - copied) { /* User didn't give us a big enough buffer, * what to do? This is a pretty serious error. * * Since we haven't updated the RX ring head to * consume any of the packets, signal the error * to the user and just leave the RX ring alone. * * This seems the best behavior because this allows * a user of the LDC layer to start with a small * RX buffer for ldc_read() calls and use -EMSGSIZE * as a cue to enlarge it's read buffer. */ err = -EMSGSIZE; break; } /* Ok, we are gonna eat this one. */ new = rx_advance(lp, new); memcpy(buf, (lp->cfg.mode == LDC_MODE_UNRELIABLE ? p->u.u_data : p->u.r.r_data), pkt_len); buf += pkt_len; copied += pkt_len; if (p->env & LDC_STOP) break; no_data: if (new == lp->rx_tail) { err = rx_data_wait(lp, new); if (err) break; } } if (!err) err = rx_set_head(lp, new); if (err && first_frag) lp->rcv_nxt = first_frag->seqid - 1; if (!err) { err = copied; if (err > 0 && lp->cfg.mode != LDC_MODE_UNRELIABLE) send_data_ack(lp); } return err; } static const struct ldc_mode_ops nonraw_ops = { .write = write_nonraw, .read = read_nonraw, }; static int write_stream(struct ldc_channel *lp, const void *buf, unsigned int size) { if (size > lp->cfg.mtu) size = lp->cfg.mtu; return write_nonraw(lp, buf, size); } static int read_stream(struct ldc_channel *lp, void *buf, unsigned int size) { if (!lp->mssbuf_len) { int err = read_nonraw(lp, lp->mssbuf, lp->cfg.mtu); if (err < 0) return err; lp->mssbuf_len = err; lp->mssbuf_off = 0; } if (size > lp->mssbuf_len) size = lp->mssbuf_len; memcpy(buf, lp->mssbuf + lp->mssbuf_off, size); lp->mssbuf_off += size; lp->mssbuf_len -= size; return size; } static const struct ldc_mode_ops stream_ops = { .write = write_stream, .read = read_stream, 
}; int ldc_write(struct ldc_channel *lp, const void *buf, unsigned int size) { unsigned long flags; int err; if (!buf) return -EINVAL; if (!size) return 0; spin_lock_irqsave(&lp->lock, flags); if (lp->hs_state != LDC_HS_COMPLETE) err = -ENOTCONN; else err = lp->mops->write(lp, buf, size); spin_unlock_irqrestore(&lp->lock, flags); return err; } EXPORT_SYMBOL(ldc_write); int ldc_read(struct ldc_channel *lp, void *buf, unsigned int size) { unsigned long flags; int err; if (!buf) return -EINVAL; if (!size) return 0; spin_lock_irqsave(&lp->lock, flags); if (lp->hs_state != LDC_HS_COMPLETE) err = -ENOTCONN; else err = lp->mops->read(lp, buf, size); spin_unlock_irqrestore(&lp->lock, flags); return err; } EXPORT_SYMBOL(ldc_read); static long arena_alloc(struct ldc_iommu *iommu, unsigned long npages) { struct iommu_arena *arena = &iommu->arena; unsigned long n, i, start, end, limit; int pass; limit = arena->limit; start = arena->hint; pass = 0; again: n = find_next_zero_bit(arena->map, limit, start); end = n + npages; if (unlikely(end >= limit)) { if (likely(pass < 1)) { limit = start; start = 0; pass++; goto again; } else { /* Scanned the whole thing, give up. 
*/ return -1; } } for (i = n; i < end; i++) { if (test_bit(i, arena->map)) { start = i + 1; goto again; } } for (i = n; i < end; i++) __set_bit(i, arena->map); arena->hint = end; return n; } #define COOKIE_PGSZ_CODE 0xf000000000000000ULL #define COOKIE_PGSZ_CODE_SHIFT 60ULL static u64 pagesize_code(void) { switch (PAGE_SIZE) { default: case (8ULL * 1024ULL): return 0; case (64ULL * 1024ULL): return 1; case (512ULL * 1024ULL): return 2; case (4ULL * 1024ULL * 1024ULL): return 3; case (32ULL * 1024ULL * 1024ULL): return 4; case (256ULL * 1024ULL * 1024ULL): return 5; } } static u64 make_cookie(u64 index, u64 pgsz_code, u64 page_offset) { return ((pgsz_code << COOKIE_PGSZ_CODE_SHIFT) | (index << PAGE_SHIFT) | page_offset); } static u64 cookie_to_index(u64 cookie, unsigned long *shift) { u64 szcode = cookie >> COOKIE_PGSZ_CODE_SHIFT; cookie &= ~COOKIE_PGSZ_CODE; *shift = szcode * 3; return (cookie >> (13ULL + (szcode * 3ULL))); } static struct ldc_mtable_entry *alloc_npages(struct ldc_iommu *iommu, unsigned long npages) { long entry; entry = arena_alloc(iommu, npages); if (unlikely(entry < 0)) return NULL; return iommu->page_table + entry; } static u64 perm_to_mte(unsigned int map_perm) { u64 mte_base; mte_base = pagesize_code(); if (map_perm & LDC_MAP_SHADOW) { if (map_perm & LDC_MAP_R) mte_base |= LDC_MTE_COPY_R; if (map_perm & LDC_MAP_W) mte_base |= LDC_MTE_COPY_W; } if (map_perm & LDC_MAP_DIRECT) { if (map_perm & LDC_MAP_R) mte_base |= LDC_MTE_READ; if (map_perm & LDC_MAP_W) mte_base |= LDC_MTE_WRITE; if (map_perm & LDC_MAP_X) mte_base |= LDC_MTE_EXEC; } if (map_perm & LDC_MAP_IO) { if (map_perm & LDC_MAP_R) mte_base |= LDC_MTE_IOMMU_R; if (map_perm & LDC_MAP_W) mte_base |= LDC_MTE_IOMMU_W; } return mte_base; } static int pages_in_region(unsigned long base, long len) { int count = 0; do { unsigned long new = (base + PAGE_SIZE) & PAGE_MASK; len -= (new - base); base = new; count++; } while (len > 0); return count; } struct cookie_state { struct ldc_mtable_entry 
*page_table; struct ldc_trans_cookie *cookies; u64 mte_base; u64 prev_cookie; u32 pte_idx; u32 nc; }; static void fill_cookies(struct cookie_state *sp, unsigned long pa, unsigned long off, unsigned long len) { do { unsigned long tlen, new = pa + PAGE_SIZE; u64 this_cookie; sp->page_table[sp->pte_idx].mte = sp->mte_base | pa; tlen = PAGE_SIZE; if (off) tlen = PAGE_SIZE - off; if (tlen > len) tlen = len; this_cookie = make_cookie(sp->pte_idx, pagesize_code(), off); off = 0; if (this_cookie == sp->prev_cookie) { sp->cookies[sp->nc - 1].cookie_size += tlen; } else { sp->cookies[sp->nc].cookie_addr = this_cookie; sp->cookies[sp->nc].cookie_size = tlen; sp->nc++; } sp->prev_cookie = this_cookie + tlen; sp->pte_idx++; len -= tlen; pa = new; } while (len > 0); } static int sg_count_one(struct scatterlist *sg) { unsigned long base = page_to_pfn(sg_page(sg)) << PAGE_SHIFT; long len = sg->length; if ((sg->offset | len) & (8UL - 1)) return -EFAULT; return pages_in_region(base + sg->offset, len); } static int sg_count_pages(struct scatterlist *sg, int num_sg) { int count; int i; count = 0; for (i = 0; i < num_sg; i++) { int err = sg_count_one(sg + i); if (err < 0) return err; count += err; } return count; } int ldc_map_sg(struct ldc_channel *lp, struct scatterlist *sg, int num_sg, struct ldc_trans_cookie *cookies, int ncookies, unsigned int map_perm) { unsigned long i, npages, flags; struct ldc_mtable_entry *base; struct cookie_state state; struct ldc_iommu *iommu; int err; if (map_perm & ~LDC_MAP_ALL) return -EINVAL; err = sg_count_pages(sg, num_sg); if (err < 0) return err; npages = err; if (err > ncookies) return -EMSGSIZE; iommu = &lp->iommu; spin_lock_irqsave(&iommu->lock, flags); base = alloc_npages(iommu, npages); spin_unlock_irqrestore(&iommu->lock, flags); if (!base) return -ENOMEM; state.page_table = iommu->page_table; state.cookies = cookies; state.mte_base = perm_to_mte(map_perm); state.prev_cookie = ~(u64)0; state.pte_idx = (base - iommu->page_table); state.nc = 0; 
for (i = 0; i < num_sg; i++) fill_cookies(&state, page_to_pfn(sg[i].page) << PAGE_SHIFT, sg[i].offset, sg[i].length); return state.nc; } EXPORT_SYMBOL(ldc_map_sg); int ldc_map_single(struct ldc_channel *lp, void *buf, unsigned int len, struct ldc_trans_cookie *cookies, int ncookies, unsigned int map_perm) { unsigned long npages, pa, flags; struct ldc_mtable_entry *base; struct cookie_state state; struct ldc_iommu *iommu; if ((map_perm & ~LDC_MAP_ALL) || (ncookies < 1)) return -EINVAL; pa = __pa(buf); if ((pa | len) & (8UL - 1)) return -EFAULT; npages = pages_in_region(pa, len); iommu = &lp->iommu; spin_lock_irqsave(&iommu->lock, flags); base = alloc_npages(iommu, npages); spin_unlock_irqrestore(&iommu->lock, flags); if (!base) return -ENOMEM; state.page_table = iommu->page_table; state.cookies = cookies; state.mte_base = perm_to_mte(map_perm); state.prev_cookie = ~(u64)0; state.pte_idx = (base - iommu->page_table); state.nc = 0; fill_cookies(&state, (pa & PAGE_MASK), (pa & ~PAGE_MASK), len); BUG_ON(state.nc != 1); return state.nc; } EXPORT_SYMBOL(ldc_map_single); static void free_npages(unsigned long id, struct ldc_iommu *iommu, u64 cookie, u64 size) { struct iommu_arena *arena = &iommu->arena; unsigned long i, shift, index, npages; struct ldc_mtable_entry *base; npages = PAGE_ALIGN(((cookie & ~PAGE_MASK) + size)) >> PAGE_SHIFT; index = cookie_to_index(cookie, &shift); base = iommu->page_table + index; BUG_ON(index > arena->limit || (index + npages) > arena->limit); for (i = 0; i < npages; i++) { if (base->cookie) sun4v_ldc_revoke(id, cookie + (i << shift), base->cookie); base->mte = 0; __clear_bit(index + i, arena->map); } } void ldc_unmap(struct ldc_channel *lp, struct ldc_trans_cookie *cookies, int ncookies) { struct ldc_iommu *iommu = &lp->iommu; unsigned long flags; int i; spin_lock_irqsave(&iommu->lock, flags); for (i = 0; i < ncookies; i++) { u64 addr = cookies[i].cookie_addr; u64 size = cookies[i].cookie_size; free_npages(lp->id, iommu, addr, size); } 
spin_unlock_irqrestore(&iommu->lock, flags); } EXPORT_SYMBOL(ldc_unmap); int ldc_copy(struct ldc_channel *lp, int copy_dir, void *buf, unsigned int len, unsigned long offset, struct ldc_trans_cookie *cookies, int ncookies) { unsigned int orig_len; unsigned long ra; int i; if (copy_dir != LDC_COPY_IN && copy_dir != LDC_COPY_OUT) { printk(KERN_ERR PFX "ldc_copy: ID[%lu] Bad copy_dir[%d]\n", lp->id, copy_dir); return -EINVAL; } ra = __pa(buf); if ((ra | len | offset) & (8UL - 1)) { printk(KERN_ERR PFX "ldc_copy: ID[%lu] Unaligned buffer " "ra[%lx] len[%x] offset[%lx]\n", lp->id, ra, len, offset); return -EFAULT; } if (lp->hs_state != LDC_HS_COMPLETE || (lp->flags & LDC_FLAG_RESET)) { printk(KERN_ERR PFX "ldc_copy: ID[%lu] Link down hs_state[%x] " "flags[%x]\n", lp->id, lp->hs_state, lp->flags); return -ECONNRESET; } orig_len = len; for (i = 0; i < ncookies; i++) { unsigned long cookie_raddr = cookies[i].cookie_addr; unsigned long this_len = cookies[i].cookie_size; unsigned long actual_len; if (unlikely(offset)) { unsigned long this_off = offset; if (this_off > this_len) this_off = this_len; offset -= this_off; this_len -= this_off; if (!this_len) continue; cookie_raddr += this_off; } if (this_len > len) this_len = len; while (1) { unsigned long hv_err; hv_err = sun4v_ldc_copy(lp->id, copy_dir, cookie_raddr, ra, this_len, &actual_len); if (unlikely(hv_err)) { printk(KERN_ERR PFX "ldc_copy: ID[%lu] " "HV error %lu\n", lp->id, hv_err); if (lp->hs_state != LDC_HS_COMPLETE || (lp->flags & LDC_FLAG_RESET)) return -ECONNRESET; else return -EFAULT; } cookie_raddr += actual_len; ra += actual_len; len -= actual_len; if (actual_len == this_len) break; this_len -= actual_len; } if (!len) break; } /* It is caller policy what to do about short copies. * For example, a networking driver can declare the * packet a runt and drop it. 
*/ return orig_len - len; } EXPORT_SYMBOL(ldc_copy); void *ldc_alloc_exp_dring(struct ldc_channel *lp, unsigned int len, struct ldc_trans_cookie *cookies, int *ncookies, unsigned int map_perm) { void *buf; int err; if (len & (8UL - 1)) return ERR_PTR(-EINVAL); buf = kzalloc(len, GFP_KERNEL); if (!buf) return ERR_PTR(-ENOMEM); err = ldc_map_single(lp, buf, len, cookies, *ncookies, map_perm); if (err < 0) { kfree(buf); return ERR_PTR(err); } *ncookies = err; return buf; } EXPORT_SYMBOL(ldc_alloc_exp_dring); void ldc_free_exp_dring(struct ldc_channel *lp, void *buf, unsigned int len, struct ldc_trans_cookie *cookies, int ncookies) { ldc_unmap(lp, cookies, ncookies); kfree(buf); } EXPORT_SYMBOL(ldc_free_exp_dring); static int __init ldc_init(void) { unsigned long major, minor; struct mdesc_handle *hp; const u64 *v; u64 mp; hp = mdesc_grab(); if (!hp) return -ENODEV; mp = mdesc_node_by_name(hp, MDESC_NODE_NULL, "platform"); if (mp == MDESC_NODE_NULL) return -ENODEV; v = mdesc_get_property(hp, mp, "domaining-enabled", NULL); if (!v) return -ENODEV; major = 1; minor = 0; if (sun4v_hvapi_register(HV_GRP_LDOM, major, &minor)) { printk(KERN_INFO PFX "Could not register LDOM hvapi.\n"); return -ENODEV; } printk(KERN_INFO "%s", version); if (!*v) { printk(KERN_INFO PFX "Domaining disabled.\n"); return -ENODEV; } ldom_domaining_enabled = 1; return 0; } core_initcall(ldc_init);