/* Copyright (c) 2006 Coraid, Inc. See COPYING for GPL terms. */ /* * aoecmd.c * Filesystem request handling methods */ #include #include #include #include #include #include #include "aoe.h" #define TIMERTICK (HZ / 10) #define MINTIMER (2 * TIMERTICK) #define MAXTIMER (HZ << 1) static int aoe_deadsecs = 60 * 3; module_param(aoe_deadsecs, int, 0644); MODULE_PARM_DESC(aoe_deadsecs, "After aoe_deadsecs seconds, give up and fail dev."); struct sk_buff * new_skb(ulong len) { struct sk_buff *skb; skb = alloc_skb(len, GFP_ATOMIC); if (skb) { skb->nh.raw = skb->mac.raw = skb->data; skb->protocol = __constant_htons(ETH_P_AOE); skb->priority = 0; skb_put(skb, len); memset(skb->head, 0, len); skb->next = skb->prev = NULL; /* tell the network layer not to perform IP checksums * or to get the NIC to do it */ skb->ip_summed = CHECKSUM_NONE; } return skb; } static struct frame * getframe(struct aoedev *d, int tag) { struct frame *f, *e; f = d->frames; e = f + d->nframes; for (; ftag == tag) return f; return NULL; } /* * Leave the top bit clear so we have tagspace for userland. * The bottom 16 bits are the xmit tick for rexmit/rttavg processing. * This driver reserves tag -1 to mean "unused frame." */ static int newtag(struct aoedev *d) { register ulong n; n = jiffies & 0xffff; return n |= (++d->lasttag & 0x7fff) << 16; } static int aoehdr_atainit(struct aoedev *d, struct aoe_hdr *h) { u32 host_tag = newtag(d); memcpy(h->src, d->ifp->dev_addr, sizeof h->src); memcpy(h->dst, d->addr, sizeof h->dst); h->type = __constant_cpu_to_be16(ETH_P_AOE); h->verfl = AOE_HVER; h->major = cpu_to_be16(d->aoemajor); h->minor = d->aoeminor; h->cmd = AOECMD_ATA; h->tag = cpu_to_be32(host_tag); return host_tag; } static inline void put_lba(struct aoe_atahdr *ah, sector_t lba) { ah->lba0 = lba; ah->lba1 = lba >>= 8; ah->lba2 = lba >>= 8; ah->lba3 = lba >>= 8; ah->lba4 = lba >>= 8; ah->lba5 = lba >>= 8; } static void aoecmd_ata_rw(struct aoedev *d, struct frame *f) { struct aoe_hdr *h; struct aoe_atahdr *ah; struct buf *buf; struct sk_buff *skb; ulong bcnt; register sector_t sector; char writebit, extbit; writebit = 0x10; extbit = 0x4; buf = d->inprocess; sector = buf->sector; bcnt = buf->bv_resid; if (bcnt > d->maxbcnt) bcnt = d->maxbcnt; /* initialize the headers & frame */ skb = f->skb; h = (struct aoe_hdr *) skb->mac.raw; ah = (struct aoe_atahdr *) (h+1); skb->len = sizeof *h + sizeof *ah; memset(h, 0, ETH_ZLEN); f->tag = aoehdr_atainit(d, h); f->waited = 0; f->buf = buf; f->bufaddr = buf->bufaddr; f->bcnt = bcnt; f->lba = sector; /* set up ata header */ ah->scnt = bcnt >> 9; put_lba(ah, sector); if (d->flags & DEVFL_EXT) { ah->aflags |= AOEAFL_EXT; } else { extbit = 0; ah->lba3 &= 0x0f; ah->lba3 |= 0xe0; /* LBA bit + obsolete 0xa0 */ } if (bio_data_dir(buf->bio) == WRITE) { skb_fill_page_desc(skb, 0, virt_to_page(f->bufaddr), offset_in_page(f->bufaddr), bcnt); ah->aflags |= AOEAFL_WRITE; skb->len += bcnt; skb->data_len = bcnt; } else { skb->len = ETH_ZLEN; writebit = 0; } ah->cmdstat = WIN_READ | writebit | extbit; /* mark all tracking fields and load out */ buf->nframesout += 1; buf->bufaddr += bcnt; buf->bv_resid -= bcnt; /* printk(KERN_DEBUG "aoe: bv_resid=%ld\n", buf->bv_resid); */ buf->resid -= bcnt; buf->sector += bcnt >> 9; if (buf->resid == 0) { d->inprocess = NULL; } else if (buf->bv_resid == 0) { buf->bv++; WARN_ON(buf->bv->bv_len == 0); buf->bv_resid = buf->bv->bv_len; buf->bufaddr = page_address(buf->bv->bv_page) + buf->bv->bv_offset; } skb->dev = d->ifp; skb = skb_clone(skb, GFP_ATOMIC); if (skb == NULL) return; if (d->sendq_hd) d->sendq_tl->next = skb; else d->sendq_hd = skb; d->sendq_tl = skb; } /* some callers cannot sleep, and they can call this function, * transmitting the packets later, when interrupts are on */ static struct sk_buff * aoecmd_cfg_pkts(ushort aoemajor, unsigned char aoeminor, struct sk_buff **tail) { struct aoe_hdr *h; struct aoe_cfghdr *ch; struct sk_buff *skb, *sl, *sl_tail; struct net_device *ifp; sl = sl_tail = NULL; read_lock(&dev_base_lock); for (ifp = dev_base; ifp; dev_put(ifp), ifp = ifp->next) { dev_hold(ifp); if (!is_aoe_netif(ifp)) continue; skb = new_skb(sizeof *h + sizeof *ch); if (skb == NULL) { printk(KERN_INFO "aoe: skb alloc failure\n"); continue; } skb->dev = ifp; if (sl_tail == NULL) sl_tail = skb; h = (struct aoe_hdr *) skb->mac.raw; memset(h, 0, sizeof *h + sizeof *ch); memset(h->dst, 0xff, sizeof h->dst); memcpy(h->src, ifp->dev_addr, sizeof h->src); h->type = __constant_cpu_to_be16(ETH_P_AOE); h->verfl = AOE_HVER; h->major = cpu_to_be16(aoemajor); h->minor = aoeminor; h->cmd = AOECMD_CFG; skb->next = sl; sl = skb; } read_unlock(&dev_base_lock); if (tail != NULL) *tail = sl_tail; return sl; } static struct frame * freeframe(struct aoedev *d) { struct frame *f, *e; int n = 0; f = d->frames; e = f + d->nframes; for (; ftag != FREETAG) continue; if (atomic_read(&skb_shinfo(f->skb)->dataref) == 1) { skb_shinfo(f->skb)->nr_frags = f->skb->data_len = 0; return f; } n++; } if (n == d->nframes) /* wait for network layer */ d->flags |= DEVFL_KICKME; return NULL; } /* enters with d->lock held */ void aoecmd_work(struct aoedev *d) { struct frame *f; struct buf *buf; if (d->flags & DEVFL_PAUSE) { if (!aoedev_isbusy(d)) d->sendq_hd = aoecmd_cfg_pkts(d->aoemajor, d->aoeminor, &d->sendq_tl); return; } loop: f = freeframe(d); if (f == NULL) return; if (d->inprocess == NULL) { if (list_empty(&d->bufq)) return; buf = container_of(d->bufq.next, struct buf, bufs); list_del(d->bufq.next); /*printk(KERN_DEBUG "aoe: bi_size=%ld\n", buf->bio->bi_size); */ d->inprocess = buf; } aoecmd_ata_rw(d, f); goto loop; } static void rexmit(struct aoedev *d, struct frame *f) { struct sk_buff *skb; struct aoe_hdr *h; struct aoe_atahdr *ah; char buf[128]; u32 n; n = newtag(d); snprintf(buf, sizeof buf, "%15s e%ld.%ld oldtag=%08x@%08lx newtag=%08x\n", "retransmit", d->aoemajor, d->aoeminor, f->tag, jiffies, n); aoechr_error(buf); skb = f->skb; h = (struct aoe_hdr *) skb->mac.raw; ah = (struct aoe_atahdr *) (h+1); f->tag = n; h->tag = cpu_to_be32(n); memcpy(h->dst, d->addr, sizeof h->dst); memcpy(h->src, d->ifp->dev_addr, sizeof h->src); n = DEFAULTBCNT / 512; if (ah->scnt > n) { ah->scnt = n; if (ah->aflags & AOEAFL_WRITE) { skb_fill_page_desc(skb, 0, virt_to_page(f->bufaddr), offset_in_page(f->bufaddr), DEFAULTBCNT); skb->len = sizeof *h + sizeof *ah + DEFAULTBCNT; skb->data_len = DEFAULTBCNT; } if (++d->lostjumbo > (d->nframes << 1)) if (d->maxbcnt != DEFAULTBCNT) { printk(KERN_INFO "aoe: e%ld.%ld: too many lost jumbo on %s - using 1KB frames.\n", d->aoemajor, d->aoeminor, d->ifp->name); d->maxbcnt = DEFAULTBCNT; d->flags |= DEVFL_MAXBCNT; } } skb->dev = d->ifp; skb = skb_clone(skb, GFP_ATOMIC); if (skb == NULL) return; if (d->sendq_hd) d->sendq_tl->next = skb; else d->sendq_hd = skb; d->sendq_tl = skb; } static int tsince(int tag) { int n; n = jiffies & 0xffff; n -= tag & 0xffff; if (n < 0) n += 1<<16; return n; } static void rexmit_timer(ulong vp) { struct aoedev *d; struct frame *f, *e; struct sk_buff *sl; register long timeout; ulong flags, n; d = (struct aoedev *) vp; sl = NULL; /* timeout is always ~150% of the moving average */ timeout = d->rttavg; timeout += timeout >> 1; spin_lock_irqsave(&d->lock, flags); if (d->flags & DEVFL_TKILL) { spin_unlock_irqrestore(&d->lock, flags); return; } f = d->frames; e = f + d->nframes; for (; ftag != FREETAG && tsince(f->tag) >= timeout) { n = f->waited += timeout; n /= HZ; if (n > aoe_deadsecs) { /* waited too long for response */ aoedev_downdev(d); break; } rexmit(d, f); } } if (d->flags & DEVFL_KICKME) { d->flags &= ~DEVFL_KICKME; aoecmd_work(d); } sl = d->sendq_hd; d->sendq_hd = d->sendq_tl = NULL; if (sl) { n = d->rttavg <<= 1; if (n > MAXTIMER) d->rttavg = MAXTIMER; } d->timer.expires = jiffies + TIMERTICK; add_timer(&d->timer); spin_unlock_irqrestore(&d->lock, flags); aoenet_xmit(sl); } /* this function performs work that has been deferred until sleeping is OK */ void aoecmd_sleepwork(void *vp) { struct aoedev *d = (struct aoedev *) vp; if (d->flags & DEVFL_GDALLOC) aoeblk_gdalloc(d); if (d->flags & DEVFL_NEWSIZE) { struct block_device *bd; unsigned long flags; u64 ssize; ssize = d->gd->capacity; bd = bdget_disk(d->gd, 0); if (bd) { mutex_lock(&bd->bd_inode->i_mutex); i_size_write(bd->bd_inode, (loff_t)ssize<<9); mutex_unlock(&bd->bd_inode->i_mutex); bdput(bd); } spin_lock_irqsave(&d->lock, flags); d->flags |= DEVFL_UP; d->flags &= ~DEVFL_NEWSIZE; spin_unlock_irqrestore(&d->lock, flags); } } static void ataid_complete(struct aoedev *d, unsigned char *id) { u64 ssize; u16 n; /* word 83: command set supported */ n = le16_to_cpu(get_unaligned((__le16 *) &id[83<<1])); /* word 86: command set/feature enabled */ n |= le16_to_cpu(get_unaligned((__le16 *) &id[86<<1])); if (n & (1<<10)) { /* bit 10: LBA 48 */ d->flags |= DEVFL_EXT; /* word 100: number lba48 sectors */ ssize = le64_to_cpu(get_unaligned((__le64 *) &id[100<<1])); /* set as in ide-disk.c:init_idedisk_capacity */ d->geo.cylinders = ssize; d->geo.cylinders /= (255 * 63); d->geo.heads = 255; d->geo.sectors = 63; } else { d->flags &= ~DEVFL_EXT; /* number lba28 sectors */ ssize = le32_to_cpu(get_unaligned((__le32 *) &id[60<<1])); /* NOTE: obsolete in ATA 6 */ d->geo.cylinders = le16_to_cpu(get_unaligned((__le16 *) &id[54<<1])); d->geo.heads = le16_to_cpu(get_unaligned((__le16 *) &id[55<<1])); d->geo.sectors = le16_to_cpu(get_unaligned((__le16 *) &id[56<<1])); } if (d->ssize != ssize) printk(KERN_INFO "aoe: %012llx e%lu.%lu v%04x has %llu sectors\n", (unsigned long long)mac_addr(d->addr), d->aoemajor, d->aoeminor, d->fw_ver, (long long)ssize); d->ssize = ssize; d->geo.start = 0; if (d->gd != NULL) { d->gd->capacity = ssize; d->flags |= DEVFL_NEWSIZE; } else { if (d->flags & DEVFL_GDALLOC) { printk(KERN_ERR "aoe: can't schedule work for e%lu.%lu, %s\n", d->aoemajor, d->aoeminor, "it's already on! This shouldn't happen.\n"); return; } d->flags |= DEVFL_GDALLOC; } schedule_work(&d->work); } static void calc_rttavg(struct aoedev *d, int rtt) { register long n; n = rtt; if (n < 0) { n = -rtt; if (n < MINTIMER) n = MINTIMER; else if (n > MAXTIMER) n = MAXTIMER; d->mintimer += (n - d->mintimer) >> 1; } else if (n < d->mintimer) n = d->mintimer; else if (n > MAXTIMER) n = MAXTIMER; /* g == .25; cf. Congestion Avoidance and Control, Jacobson & Karels; 1988 */ n -= d->rttavg; d->rttavg += n >> 2; } void aoecmd_ata_rsp(struct sk_buff *skb) { struct aoedev *d; struct aoe_hdr *hin, *hout; struct aoe_atahdr *ahin, *ahout; struct frame *f; struct buf *buf; struct sk_buff *sl; register long n; ulong flags; char ebuf[128]; u16 aoemajor; hin = (struct aoe_hdr *) skb->mac.raw; aoemajor = be16_to_cpu(hin->major); d = aoedev_by_aoeaddr(aoemajor, hin->minor); if (d == NULL) { snprintf(ebuf, sizeof ebuf, "aoecmd_ata_rsp: ata response " "for unknown device %d.%d\n", aoemajor, hin->minor); aoechr_error(ebuf); return; } spin_lock_irqsave(&d->lock, flags); n = be32_to_cpu(hin->tag); f = getframe(d, n); if (f == NULL) { calc_rttavg(d, -tsince(n)); spin_unlock_irqrestore(&d->lock, flags); snprintf(ebuf, sizeof ebuf, "%15s e%d.%d tag=%08x@%08lx\n", "unexpected rsp", be16_to_cpu(hin->major), hin->minor, be32_to_cpu(hin->tag), jiffies); aoechr_error(ebuf); return; } calc_rttavg(d, tsince(f->tag)); ahin = (struct aoe_atahdr *) (hin+1); hout = (struct aoe_hdr *) f->skb->mac.raw; ahout = (struct aoe_atahdr *) (hout+1); buf = f->buf; if (ahout->cmdstat == WIN_IDENTIFY) d->flags &= ~DEVFL_PAUSE; if (ahin->cmdstat & 0xa9) { /* these bits cleared on success */ printk(KERN_ERR "aoe: ata error cmd=%2.2Xh stat=%2.2Xh from e%ld.%ld\n", ahout->cmdstat, ahin->cmdstat, d->aoemajor, d->aoeminor); if (buf) buf->flags |= BUFFL_FAIL; } else { n = ahout->scnt << 9; switch (ahout->cmdstat) { case WIN_READ: case WIN_READ_EXT: if (skb->len - sizeof *hin - sizeof *ahin < n) { printk(KERN_ERR "aoe: runt data size in read. skb->len=%d\n", skb->len); /* fail frame f? just returning will rexmit. */ spin_unlock_irqrestore(&d->lock, flags); return; } memcpy(f->bufaddr, ahin+1, n); case WIN_WRITE: case WIN_WRITE_EXT: if (f->bcnt -= n) { skb = f->skb; f->bufaddr += n; put_lba(ahout, f->lba += ahout->scnt); n = f->bcnt; if (n > DEFAULTBCNT) n = DEFAULTBCNT; ahout->scnt = n >> 9; if (ahout->aflags & AOEAFL_WRITE) { skb_fill_page_desc(skb, 0, virt_to_page(f->bufaddr), offset_in_page(f->bufaddr), n); skb->len = sizeof *hout + sizeof *ahout + n; skb->data_len = n; } f->tag = newtag(d); hout->tag = cpu_to_be32(f->tag); skb->dev = d->ifp; skb = skb_clone(skb, GFP_ATOMIC); spin_unlock_irqrestore(&d->lock, flags); if (skb) aoenet_xmit(skb); return; } if (n > DEFAULTBCNT) d->lostjumbo = 0; break; case WIN_IDENTIFY: if (skb->len - sizeof *hin - sizeof *ahin < 512) { printk(KERN_INFO "aoe: runt data size in ataid. skb->len=%d\n", skb->len); spin_unlock_irqrestore(&d->lock, flags); return; } ataid_complete(d, (char *) (ahin+1)); break; default: printk(KERN_INFO "aoe: unrecognized ata command %2.2Xh for %d.%d\n", ahout->cmdstat, be16_to_cpu(hin->major), hin->minor); } } if (buf) { buf->nframesout -= 1; if (buf->nframesout == 0 && buf->resid == 0) { unsigned long duration = jiffies - buf->start_time; unsigned long n_sect = buf->bio->bi_size >> 9; struct gendisk *disk = d->gd; const int rw = bio_data_dir(buf->bio); disk_stat_inc(disk, ios[rw]); disk_stat_add(disk, ticks[rw], duration); disk_stat_add(disk, sectors[rw], n_sect); disk_stat_add(disk, io_ticks, duration); n = (buf->flags & BUFFL_FAIL) ? -EIO : 0; bio_endio(buf->bio, buf->bio->bi_size, n); mempool_free(buf, d->bufpool); } } f->buf = NULL; f->tag = FREETAG; aoecmd_work(d); sl = d->sendq_hd; d->sendq_hd = d->sendq_tl = NULL; spin_unlock_irqrestore(&d->lock, flags); aoenet_xmit(sl); } void aoecmd_cfg(ushort aoemajor, unsigned char aoeminor) { struct sk_buff *sl; sl = aoecmd_cfg_pkts(aoemajor, aoeminor, NULL); aoenet_xmit(sl); } /* * Since we only call this in one place (and it only prepares one frame) * we just return the skb. Usually we'd chain it up to the aoedev sendq. */ static struct sk_buff * aoecmd_ata_id(struct aoedev *d) { struct aoe_hdr *h; struct aoe_atahdr *ah; struct frame *f; struct sk_buff *skb; f = freeframe(d); if (f == NULL) { printk(KERN_ERR "aoe: can't get a frame. This shouldn't happen.\n"); return NULL; } /* initialize the headers & frame */ skb = f->skb; h = (struct aoe_hdr *) skb->mac.raw; ah = (struct aoe_atahdr *) (h+1); skb->len = ETH_ZLEN; memset(h, 0, ETH_ZLEN); f->tag = aoehdr_atainit(d, h); f->waited = 0; /* set up ata header */ ah->scnt = 1; ah->cmdstat = WIN_IDENTIFY; ah->lba3 = 0xa0; skb->dev = d->ifp; d->rttavg = MAXTIMER; d->timer.function = rexmit_timer; return skb_clone(skb, GFP_ATOMIC); } void aoecmd_cfg_rsp(struct sk_buff *skb) { struct aoedev *d; struct aoe_hdr *h; struct aoe_cfghdr *ch; ulong flags, sysminor, aoemajor; struct sk_buff *sl; enum { MAXFRAMES = 16 }; u16 n; h = (struct aoe_hdr *) skb->mac.raw; ch = (struct aoe_cfghdr *) (h+1); /* * Enough people have their dip switches set backwards to * warrant a loud message for this special case. */ aoemajor = be16_to_cpu(h->major); if (aoemajor == 0xfff) { printk(KERN_ERR "aoe: Warning: shelf address is all ones. " "Check shelf dip switches.\n"); return; } sysminor = SYSMINOR(aoemajor, h->minor); if (sysminor * AOE_PARTITIONS + AOE_PARTITIONS > MINORMASK) { printk(KERN_INFO "aoe: e%ld.%d: minor number too large\n", aoemajor, (int) h->minor); return; } n = be16_to_cpu(ch->bufcnt); if (n > MAXFRAMES) /* keep it reasonable */ n = MAXFRAMES; d = aoedev_by_sysminor_m(sysminor, n); if (d == NULL) { printk(KERN_INFO "aoe: device sysminor_m failure\n"); return; } spin_lock_irqsave(&d->lock, flags); /* permit device to migrate mac and network interface */ d->ifp = skb->dev; memcpy(d->addr, h->src, sizeof d->addr); if (!(d->flags & DEVFL_MAXBCNT)) { n = d->ifp->mtu; n -= sizeof (struct aoe_hdr) + sizeof (struct aoe_atahdr); n /= 512; if (n > ch->scnt) n = ch->scnt; n = n ? n * 512 : DEFAULTBCNT; if (n != d->maxbcnt) { printk(KERN_INFO "aoe: e%ld.%ld: setting %d byte data frames on %s\n", d->aoemajor, d->aoeminor, n, d->ifp->name); d->maxbcnt = n; } } /* don't change users' perspective */ if (d->nopen && !(d->flags & DEVFL_PAUSE)) { spin_unlock_irqrestore(&d->lock, flags); return; } d->flags |= DEVFL_PAUSE; /* force pause */ d->mintimer = MINTIMER; d->fw_ver = be16_to_cpu(ch->fwver); /* check for already outstanding ataid */ sl = aoedev_isbusy(d) == 0 ? aoecmd_ata_id(d) : NULL; spin_unlock_irqrestore(&d->lock, flags); aoenet_xmit(sl); }