Zero-copy receive prototype: the 8139too RX path asks registered zc_handler objects for a destination buffer, a TCP handler maps the packet header to a socket, and sock_sendfile() lends page-cache pages of the target file to that socket.
Review changes folded into this patch (each one edits an existing added line, no patch line is added or removed):
  - tcp_sendfile_alloc_data(): data initialised, sk tested for NULL, the allocated buffer is returned instead of NULL.
  - zc_sock_alloc_data(): returned address now accounts for zp->used.
  - sock_sendfile(): pos initialised before generic_write_checks() reads it.
  - sock_sendfile(): two for-loops whose text between '<' and '>' was lost in extraction are reconstructed and marked as such.
The #include targets were lost in the same way and are left as found. Open questions are marked NOTE(review) next to the code.

diff --git a/drivers/net/8139too.c b/drivers/net/8139too.c --- a/drivers/net/8139too.c +++ b/drivers/net/8139too.c @@ -108,6 +108,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -1895,16 +1898,23 @@ static void rtl8139_rx_err (u32 rx_statu } #if RX_BUF_IDX == 3 -static __inline__ void wrap_copy(struct sk_buff *skb, const unsigned char *ring, +static __inline__ void __wrap_copy(void *data, const unsigned char *ring, /* copies size bytes from ring[offset] into data, continuing at ring[0] once RX_BUF_LEN is reached */ u32 offset, unsigned int size) { u32 left = RX_BUF_LEN - offset; if (size > left) { - memcpy(skb->data, ring + offset, left); - memcpy(skb->data+left, ring, size - left); + memcpy(data, ring + offset, left); + memcpy(data+left, ring, size - left); } else - memcpy(skb->data, ring + offset, size); + memcpy(data, ring + offset, size); + +} + +static __inline__ void wrap_copy(struct sk_buff *skb, const unsigned char *ring, /* sk_buff front end: same copy, destination is skb->data */ + u32 offset, unsigned int size) +{ + __wrap_copy(skb->data, ring, offset, size); } #endif @@ -2005,34 +2015,71 @@ no_early_rx: goto out; } - /* Malloc up new buffer, compatible with net-2e. */ - /* Omit the four octet CRC from the length. */ + { + u8 header[128]; + int hsize = sizeof(struct ethhdr) + sizeof(struct iphdr) + sizeof(struct tcphdr) + 2; /* NOTE(review): assumes option-less IP and TCP headers; because of the extra 2 the payload copy below starts 2 bytes past those headers - confirm this is intended */ + int status; + void *priv; + struct zc_handler *zh; - skb = dev_alloc_skb (pkt_size + 2); - if (likely(skb)) { - skb->dev = dev; - skb_reserve (skb, 2); /* 16 byte align the IP fields. 
*/ #if RX_BUF_IDX == 3 - wrap_copy(skb, rx_ring, ring_offset+4, pkt_size); + __wrap_copy(header, rx_ring, ring_offset+4, hsize); #else - eth_copy_and_sum (skb, &rx_ring[ring_offset + 4], pkt_size, 0); + memcpy(header, &rx_ring[ring_offset + 4], hsize); #endif - skb_put (skb, pkt_size); + + skb = alloc_skb_zerocopy(pkt_size +2 - hsize, GFP_ATOMIC, + header, sizeof(header), &priv, &zh, &status); /* NOTE(review): only hsize bytes of header[] were filled, yet sizeof(header) is passed as header_size; pkt_size is not checked against hsize before the subtraction */ + if (skb) { + dev->last_rx = jiffies; + tp->stats.rx_bytes += pkt_size; + tp->stats.rx_packets++; - skb->protocol = eth_type_trans (skb, dev); +#if RX_BUF_IDX == 3 + wrap_copy(skb, rx_ring, ring_offset+4 + hsize, pkt_size+2-hsize); +#else + memcpy(skb->data, &rx_ring[ring_offset + 4 + hsize], pkt_size+2-hsize); +#endif + kfree_skb_zerocopy(skb, header, hsize, priv, zh); /* commits the copied bytes to the handler and frees the skb head; this branch does not pass the skb up the stack */ + + } else if (status == -1) { /* status still at its preset -1: fall back to the regular receive path */ + /* Malloc up new buffer, compatible with net-2e. */ + /* Omit the four octet CRC from the length. */ + + skb = dev_alloc_skb (pkt_size + 2); + if (likely(skb)) { + skb->dev = dev; + skb_reserve (skb, 2); /* 16 byte align the IP fields. 
*/ +#if RX_BUF_IDX == 3 + wrap_copy(skb, rx_ring, ring_offset+4, pkt_size); +#else + eth_copy_and_sum (skb, &rx_ring[ring_offset + 4], pkt_size, 0); +#endif + skb_put (skb, pkt_size); - skb->protocol = eth_type_trans (skb, dev); - dev->last_rx = jiffies; - tp->stats.rx_bytes += pkt_size; - tp->stats.rx_packets++; + skb->protocol = eth_type_trans (skb, dev); - netif_receive_skb (skb); - } else { - if (net_ratelimit()) - printk (KERN_WARNING - "%s: Memory squeeze, dropping packet.\n", - dev->name); - tp->stats.rx_dropped++; + dev->last_rx = jiffies; + tp->stats.rx_bytes += pkt_size; + tp->stats.rx_packets++; + + netif_receive_skb (skb); + } else { + if (net_ratelimit()) + printk (KERN_WARNING + "%s: Memory squeeze, dropping packet.\n", + dev->name); + tp->stats.rx_dropped++; + } + } else { /* no skb and status was changed from -1 by a handler: the packet is dropped */ + if (net_ratelimit()) + printk (KERN_WARNING + "%s: Zero-copy failed, dropping packet.\n", + dev->name); + tp->stats.rx_dropped++; + } } + received++; cur_rx = (cur_rx + rx_size + 4 + 3) & ~3; diff --git a/fs/read_write.c b/fs/read_write.c --- a/fs/read_write.c +++ b/fs/read_write.c @@ -15,6 +15,8 @@ #include #include +#include + #include #include @@ -670,8 +672,15 @@ static ssize_t do_sendfile(int out_fd, i if (!(out_file->f_mode & FMODE_WRITE)) goto fput_out; retval = -EINVAL; - if (!out_file->f_op || !out_file->f_op->sendpage) + if (!out_file->f_op) + goto fput_out; + + if (!SOCKET_I(in_file->f_dentry->d_inode) && !out_file->f_op->sendpage) { /* NOTE(review): SOCKET_I() is presumably container_of()-based and then never NULL, even for a non-socket inode - confirm; the printk below looks like leftover debugging */ + printk("%s: sock=%p, sendpage=%p.\n", __func__, + SOCKET_I(in_file->f_dentry->d_inode), out_file->f_op->sendpage); goto fput_out; + } + out_inode = out_file->f_dentry->d_inode; retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count); if (retval) @@ -688,7 +697,7 @@ static ssize_t do_sendfile(int out_fd, i retval = -EINVAL; if (unlikely(pos < 0)) goto fput_out; - if (unlikely(pos + count > max)) { + if (unlikely((unsigned long long)(pos + count) > (unsigned long long)max)) { retval = -EOVERFLOW; if (pos >= max) goto fput_out; diff --git a/include/linux/skbuff.h 
b/include/linux/skbuff.h --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1045,6 +1045,30 @@ static inline struct sk_buff *dev_alloc_ return __dev_alloc_skb(length, GFP_ATOMIC); } +struct zc_handler /* one registered zero-copy receiver: alloc_data picks a destination buffer from the packet header, commit_data is invoked once the caller has filled it */ +{ + struct list_head zc_entry; + void *(* alloc_data)(struct zc_handler *zh, void *header, unsigned int header_size, unsigned int size, void **priv, int *status); + void (* commit_data)(struct zc_handler *zh, void *header, unsigned int header_size, unsigned int size, void *priv); +}; + +extern void __kfree_skb_zerocopy(struct sk_buff *skb, void *header, unsigned int header_size, + void *priv, struct zc_handler *zh); +extern struct sk_buff *__alloc_skb_zerocopy(unsigned int size, gfp_t gfp_mask, + void *header, unsigned int header_size, void **priv, struct zc_handler **zh, int *status); + +static inline struct sk_buff *alloc_skb_zerocopy(unsigned int size, gfp_t gfp_mask, /* thin wrapper around __alloc_skb_zerocopy() */ + void *header, unsigned int header_size, void **priv, struct zc_handler **zh, int *status) +{ + return __alloc_skb_zerocopy(size, gfp_mask, header, header_size, priv, zh, status); +} + +static inline void kfree_skb_zerocopy(struct sk_buff *skb, /* thin wrapper around __kfree_skb_zerocopy() */ + void *header, unsigned int header_size, void *priv, struct zc_handler *zh) +{ + __kfree_skb_zerocopy(skb, header, header_size, priv, zh); +} + /** * skb_cow - copy header of skb when it is required * @skb: buffer to cow diff --git a/include/net/sock.h b/include/net/sock.h --- a/include/net/sock.h +++ b/include/net/sock.h @@ -117,6 +117,20 @@ struct sock_common { struct proto *skc_prot; }; +enum zc_flags { + ZC_PAGE_READY = 0, /* bit number in zc_page.flags: page is full and waits for commit_write() in sock_sendfile() */ +}; + +struct zc_page /* one page-cache page lent to a socket for zero-copy receive */ +{ + struct page *page; + struct page *cached_page; + unsigned int page_offset; /* where the prepared range starts inside the page */ + unsigned int size; /* length of the prepared range */ + unsigned int used; /* bytes of that range already handed out */ + long flags; +}; + /** * struct sock - network layer representation of sockets * @__sk_common: shared layout with inet_timewait_sock @@ -251,6 +265,13 @@ struct sock { int (*sk_backlog_rcv)(struct sock *sk, struct sk_buff *skb); void (*sk_destruct)(struct sock *sk); + + void *(* 
zc_alloc_data)(unsigned int size, void *priv); + int (* zc_commit_data)(unsigned int size, void *priv); + wait_queue_head_t zc_data_ready; + spinlock_t zc_lock; + struct zc_page *zc_pages; + unsigned int zc_page_num, zc_page_index; /* length of zc_pages[] and the index of the page currently being filled */ }; /* diff --git a/mm/filemap.c b/mm/filemap.c --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1663,7 +1663,7 @@ EXPORT_SYMBOL(read_cache_page); * caller's lru-buffering pagevec. This function is specifically for * generic_file_write(). */ -static inline struct page * +struct page * __grab_cache_page(struct address_space *mapping, unsigned long index, struct page **cached_page, struct pagevec *lru_pvec) { @@ -1692,6 +1692,8 @@ repeat: return page; } +EXPORT_SYMBOL_GPL(__grab_cache_page); /* now also called from prepare_page() in net/socket.c */ + /* * The logic we want is * diff --git a/net/core/Makefile b/net/core/Makefile --- a/net/core/Makefile +++ b/net/core/Makefile @@ -3,7 +3,7 @@ # obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \ - gen_stats.o gen_estimator.o + gen_stats.o gen_estimator.o zerocopy.o obj-$(CONFIG_SYSCTL) += sysctl_net_core.o diff --git a/net/core/skbuff.c b/net/core/skbuff.c --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -182,6 +182,56 @@ nodata: goto out; } +void *zc_alloc_data(void *header, unsigned int header_size, unsigned int size, + void **priv, struct zc_handler **__zh, int *status); +void zc_commit_data(void *header, unsigned int header_size, unsigned int size, + void *priv, struct zc_handler *zh); + +struct sk_buff *__alloc_skb_zerocopy(unsigned int size, gfp_t gfp_mask, + void *header, unsigned int header_size, + void **priv, struct zc_handler **zh, int *status) /* builds an sk_buff head whose data area comes from a zero-copy handler; returns NULL on failure; *status is preset to -1 and is only changed by a handler */ +{ + struct sk_buff *skb; + u8 *data; + + *status = -1; + + skb = kmem_cache_alloc(skbuff_head_cache, gfp_mask & ~__GFP_DMA); + if (!skb) + goto out; + + data = zc_alloc_data(header, header_size, size, priv, zh, status); + if (!data) + goto err_out_free_skb; + + memset(skb, 0, offsetof(struct sk_buff, truesize)); + skb->truesize = size + sizeof(struct sk_buff); /* __kfree_skb_zerocopy() recovers size from this value */ + 
atomic_set(&skb->users, 1); + skb->head = data; + skb->data = data; + skb->tail = data; + skb->end = data + size; + + atomic_set(&(skb_shinfo(skb)->dataref), 1); /* NOTE(review): if skb_shinfo() resolves to skb->end, this and the next four stores write at data + size, i.e. into the handler's buffer beyond the bytes that were reserved - confirm */ + skb_shinfo(skb)->nr_frags = 0; + skb_shinfo(skb)->tso_size = 0; + skb_shinfo(skb)->tso_segs = 0; + skb_shinfo(skb)->frag_list = NULL; +out: + return skb; +err_out_free_skb: + kmem_cache_free(skbuff_head_cache, skb); + skb = NULL; + goto out; +} + +void __kfree_skb_zerocopy(struct sk_buff *skb, + void *header, unsigned int header_size, void *priv, struct zc_handler *zh) /* reports the filled region to the handler, then frees only the sk_buff head */ +{ + zc_commit_data(header, header_size, skb->truesize - sizeof(struct sk_buff), priv, zh); + kmem_cache_free(skbuff_head_cache, skb); +} + /** * alloc_skb_from_cache - allocate a network buffer * @cp: kmem_cache from which to allocate the data area @@ -1739,3 +1789,5 @@ EXPORT_SYMBOL(skb_prepare_seq_read); EXPORT_SYMBOL(skb_seq_read); EXPORT_SYMBOL(skb_abort_seq_read); EXPORT_SYMBOL(skb_find_text); +EXPORT_SYMBOL(__alloc_skb_zerocopy); +EXPORT_SYMBOL(__kfree_skb_zerocopy); diff --git a/net/core/sock.c b/net/core/sock.c --- a/net/core/sock.c +++ b/net/core/sock.c @@ -704,6 +704,18 @@ void sk_free(struct sock *sk) module_put(owner); } +static void zc_sk_init(struct sock *sk) /* every socket starts with no zero-copy pages and no callbacks installed */ +{ + spin_lock_init(&sk->zc_lock); + init_waitqueue_head(&sk->zc_data_ready); + sk->zc_pages = NULL; + sk->zc_page_num = 0; + sk->zc_page_index = 0; + sk->zc_alloc_data = NULL; + sk->zc_commit_data = NULL; +} + + struct sock *sk_clone(const struct sock *sk, const gfp_t priority) { struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0); @@ -737,6 +749,8 @@ struct sock *sk_clone(const struct sock sock_reset_flag(newsk, SOCK_DONE); skb_queue_head_init(&newsk->sk_error_queue); + zc_sk_init(newsk); + filter = newsk->sk_filter; if (filter != NULL) sk_filter_charge(newsk, filter); @@ -1320,6 +1334,8 @@ void sock_init_data(struct socket *sock, sk->sk_stamp.tv_usec = -1L; atomic_set(&sk->sk_refcnt, 1); + + zc_sk_init(sk); } void fastcall lock_sock(struct sock 
*sk) diff --git a/net/core/zerocopy.c b/net/core/zerocopy.c new file mode 100644 --- /dev/null +++ b/net/core/zerocopy.c @@ -0,0 +1,135 @@ +/* + * zerocopy.c + * + * 2005 Copyright (c) Evgeniy Polyakov + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include + +#include + +static void *tcp_sendfile_alloc_data(struct zc_handler *zh, void *header, unsigned int header_size, unsigned int size, void **priv, int *status); +static void tcp_sendfile_commit_data(struct zc_handler *zh, void *header, unsigned int header_size, unsigned int size, void *priv); + +static struct zc_handler zc_tcp_sendfile_handler = { + .alloc_data = &tcp_sendfile_alloc_data, + .commit_data = &tcp_sendfile_commit_data, +}; + +static DEFINE_SPINLOCK(zc_lock); +static LIST_HEAD(zc_list); + +void *zc_alloc_data(void *header, unsigned int header_size, unsigned int size, void **priv, struct zc_handler **__zh, int *status) /* asks each registered handler in turn for a buffer of size bytes; the first non-NULL answer wins and its handler is reported through __zh */ +{ + struct zc_handler *zh; + void *data = NULL; + + if (unlikely(size > PAGE_SIZE)) /* requests larger than one page are refused */ + return NULL; + + rcu_read_lock(); + list_for_each_entry_rcu(zh, &zc_list, zc_entry) { + data = zh->alloc_data(zh, header, header_size, size, priv, status); + if (data) { + *__zh = zh; + break; + } + } + rcu_read_unlock(); + + return data; +} + +void zc_commit_data(void *header, 
unsigned int header_size, unsigned int size, void *priv, struct zc_handler *zh) /* forwards the commit to the handler that supplied the buffer, if any */ +{ + if (zh) + zh->commit_data(zh, header, header_size, size, priv); +} + +int zc_add_handler(struct zc_handler *h) /* registers a handler; both callbacks are mandatory, otherwise -EINVAL */ +{ + if (!h->alloc_data || !h->commit_data) + return -EINVAL; + + spin_lock(&zc_lock); + list_add_rcu(&h->zc_entry, &zc_list); + spin_unlock(&zc_lock); + + return 0; +} + +void zc_del_handler(struct zc_handler *h) /* unlinks a handler and waits in synchronize_rcu() before returning */ +{ + spin_lock(&zc_lock); + list_del_rcu(&h->zc_entry); + spin_unlock(&zc_lock); + + synchronize_rcu(); +} + +extern struct inet_hashinfo __cacheline_aligned tcp_hashinfo; + +static void *tcp_sendfile_alloc_data(struct zc_handler *zh, void *header, unsigned int header_size, unsigned int size, void **priv, int *status) /* maps the TCP/IP header to a socket and asks that socket for size bytes; sets *status to 0 on success and 1 when the socket has a zero-copy hook but no room */ +{ + struct ethhdr *eth; + struct iphdr *iph; + struct tcphdr *tcph; + struct sock *sk; + void *data = NULL; /* fixed: stays NULL on every path that does not allocate */ + + if (header_size < sizeof(struct ethhdr) + sizeof(struct iphdr) + sizeof(struct tcphdr)) + return NULL; + + eth = header; + iph = (struct iphdr *)(eth + 1); /* NOTE(review): eth->h_proto is not checked before these bytes are read as an IP header */ + + if (iph->protocol != IPPROTO_TCP) + return NULL; + + tcph = (struct tcphdr *)(iph + 1); /* NOTE(review): ignores iph->ihl, i.e. assumes there are no IP options */ + + /* + * I suspect it is not enough to disable BHs, + * since it can be [and is] called from hard IRQ context. 
+ */ + sk = inet_lookup(&tcp_hashinfo, iph->saddr, tcph->source, + iph->daddr, ntohs(tcph->dest), + 0); /* must do something with bound devices */ /* NOTE(review): source is passed as found while dest goes through ntohs(); if inet_lookup() returns a held reference it is never dropped in this file - confirm both */ + + if (sk && sk->zc_alloc_data) { /* fixed: guards against a lookup that found no socket */ + *priv = sk; + data = sk->zc_alloc_data(size, sk); + *status = (data)?0:1; + } + + return data; /* fixed: was an unconditional NULL, so no buffer ever reached zc_alloc_data() */ +} + +static void tcp_sendfile_commit_data(struct zc_handler *zh, void *header, unsigned int header_size, unsigned int size, void *priv) /* priv is the struct sock stored by tcp_sendfile_alloc_data() */ +{ + struct sock *sk = priv; + + sk->zc_commit_data(size, sk); +} + +int __init zc_add_tcp(void) +{ + return zc_add_handler(&zc_tcp_sendfile_handler); +} diff --git a/net/socket.c b/net/socket.c --- a/net/socket.c +++ b/net/socket.c @@ -44,6 +44,7 @@ * Tigran Aivazian : sys_send(args) calls sys_sendto(args, NULL, 0) * Tigran Aivazian : Made listen(2) backlog sanity checks * protocol-independent + * Evgeniy Polyakov: Added sock_sendfile(). * * * This program is free software; you can redistribute it and/or @@ -84,6 +85,10 @@ #include #include #include +#include +#include +#include +#include #ifdef CONFIG_NET_RADIO #include /* Note : will define WIRELESS_EXT */ @@ -116,6 +121,7 @@ static ssize_t sock_writev(struct file * unsigned long count, loff_t *ppos); static ssize_t sock_sendpage(struct file *file, struct page *page, int offset, size_t size, loff_t *ppos, int more); +ssize_t sock_sendfile(struct file *file, loff_t *ppos, size_t count, read_actor_t actor, void *target); /* @@ -136,7 +142,8 @@ static struct file_operations socket_fil .fasync = sock_fasync, .readv = sock_readv, .writev = sock_writev, - .sendpage = sock_sendpage + .sendpage = sock_sendpage, + .sendfile = sock_sendfile, }; /* @@ -726,6 +733,239 @@ static ssize_t sock_aio_write(struct kio return __sock_sendmsg(iocb, sock, &x->async_msg, size); } +void *zc_sock_alloc_data(unsigned int size, void *priv) /* reserves size bytes in the current (or the following) prepared page of the socket and returns their address, NULL when nothing fits */ +{ + struct sock *sk = priv; + void *data = NULL; + unsigned long flags; + struct zc_page *zp; + + if (!sk || !sk->zc_page_num) + goto out; + + spin_lock_irqsave(&sk->zc_lock, flags); + zp = 
&sk->zc_pages[sk->zc_page_index]; + if (zp->size == zp->used || test_bit(ZC_PAGE_READY, &zp->flags)) { + unsigned int index = sk->zc_page_index + 1; + + BUG_ON(index > sk->zc_page_num); + + if (index == sk->zc_page_num) + index = 0; + zp = &sk->zc_pages[index]; /* NOTE(review): sk->zc_page_index itself is not advanced here, while zc_sock_commit_data() always looks at zc_pages[sk->zc_page_index] */ + if (zp->size == zp->used || test_bit(ZC_PAGE_READY, &zp->flags)) + goto out_unlock; + } + if (zp->size - zp->used < size) + goto out_unlock; + + data = page_address(zp->page) + zp->page_offset + zp->used; /* fixed: skip the bytes earlier packets already reserved in this page */ + zp->used += size; + +out_unlock: + spin_unlock_irqrestore(&sk->zc_lock, flags); +out: + return data; +} + +int zc_sock_commit_data(unsigned int size, void *priv) /* marks the current page ready once it is full and wakes the sleeper in sock_sendfile(); returns 1 without doing either when size differs from the page's prepared size */ +{ + struct sock *sk = priv; + unsigned long flags; + struct zc_page *zp; + + spin_lock_irqsave(&sk->zc_lock, flags); + + BUG_ON(sk->zc_page_index + 1 > sk->zc_page_num); + + zp = &sk->zc_pages[sk->zc_page_index]; + + if (unlikely(size != zp->size)) { /* NOTE(review): compares one commit's size with the whole prepared range, so this returns before ZC_PAGE_READY can be set whenever a page is filled by several smaller packets - confirm intent */ + spin_unlock_irqrestore(&sk->zc_lock, flags); + return 1; + } + + + if (zp->used == zp->size) { + set_bit(ZC_PAGE_READY, &zp->flags); + if (++sk->zc_page_index == sk->zc_page_num) + sk->zc_page_index = 0; + } + spin_unlock_irqrestore(&sk->zc_lock, flags); + + wake_up(&sk->zc_data_ready); + + return 0; +} + +extern struct page * __grab_cache_page(struct address_space *mapping, unsigned long index, + struct page **cached_page, struct pagevec *lru_pvec); + +static int prepare_page(struct zc_page *zp, struct file *file, struct address_space *mapping, + loff_t *ppos, loff_t count, struct pagevec *lru_pvec) /* grabs the page-cache page covering *ppos, runs prepare_write() on it, records it in zp and advances *ppos by the bytes prepared; returns 0 or a negative error */ +{ + unsigned long index; + unsigned long page_offset; + unsigned long bytes; + struct address_space_operations *a_ops = mapping->a_ops; + loff_t pos_allocated = *ppos; + int err = 0; + + page_offset = (pos_allocated & (PAGE_CACHE_SIZE -1)); + index = pos_allocated >> PAGE_CACHE_SHIFT; + bytes = PAGE_CACHE_SIZE - page_offset; + if (bytes > count) + bytes = count; + + zp->page = __grab_cache_page(mapping, index, &zp->cached_page, lru_pvec); + if (!zp->page) { + err = -ENOMEM; + goto err_out_exit; + } + + err = 
a_ops->prepare_write(file, zp->page, page_offset, page_offset+bytes); + if (unlikely(err)) { + unlock_page(zp->page); + page_cache_release(zp->page); + goto err_out_exit; + } + + zp->page_offset = page_offset; + zp->size = bytes; + zp->used = 0; + clear_bit(ZC_PAGE_READY, &zp->flags); + + pos_allocated += bytes; + + *ppos = pos_allocated; + +err_out_exit: + return err; +} + +ssize_t sock_sendfile(struct file *in_file, loff_t *ppos, size_t count, read_actor_t actor, void *target) /* receives up to count bytes from the socket behind in_file directly into the page cache of the file passed as target; returns the bytes written or a negative error */ +{ + struct socket *sock; + struct sock *sk; + int err = 0; + size_t written = 0; + struct file *file = target; + struct address_space *mapping = file->f_mapping; + struct address_space_operations *a_ops = mapping->a_ops; + struct inode *inode = mapping->host; + loff_t pos = *ppos, pos_allocated; /* fixed: pos was read by generic_write_checks() before its first assignment */ + struct pagevec lru_pvec; + unsigned long flags; + int pnum_max = 16, i; + unsigned int zc_page_index; + struct zc_page *zc_pages, *zp; + + if (!count) + return 0; + + err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); + if (err) + goto err_out_exit; + + sock = SOCKET_I(in_file->f_dentry->d_inode); + + if (!sock || !sock->sk) { + err = -ENODEV; + goto err_out_exit; + } + sk = sock->sk; + + pnum_max = ((count >> PAGE_CACHE_SHIFT) > pnum_max)?pnum_max:(count >> PAGE_CACHE_SHIFT); /* NOTE(review): evaluates to 0 when count is below one page */ + zc_pages = kmalloc(sizeof(struct zc_page) * pnum_max, GFP_KERNEL); + if (!zc_pages) { + err = -ENOMEM; + goto err_out_exit; + } + + pagevec_init(&lru_pvec, 0); + + pos = pos_allocated = *ppos; + + err = 0; + for (i=0; i<pnum_max; ++i) { /* NOTE(review): from this '<' up to "sk->zc_lock" the text was lost in extraction and is reconstructed from prepare_page(), the err_out_release_pages label and the use of zc_page_index below - verify against the original posting */ + err = prepare_page(&zc_pages[i], file, mapping, &pos_allocated, count, &lru_pvec); + if (err) + goto err_out_release_pages; + } + + zc_page_index = 0; + + spin_lock_irqsave(&sk->zc_lock, flags); + sk->zc_pages = zc_pages; + sk->zc_page_num = pnum_max; + sk->zc_page_index = zc_page_index; + sk->zc_alloc_data = &zc_sock_alloc_data; + sk->zc_commit_data = &zc_sock_commit_data; + spin_unlock_irqrestore(&sk->zc_lock, flags); + + while (count) { + struct zc_page *zp; + + interruptible_sleep_on(&sk->zc_data_ready); /* NOTE(review): no condition is tested before sleeping and no signal check follows - confirm a wake-up cannot be missed */ + + spin_lock_irqsave(&sk->zc_lock, flags); + if (zc_page_index == sk->zc_page_index) { + spin_unlock_irqrestore(&sk->zc_lock, flags); + continue; + } + 
spin_unlock_irqrestore(&sk->zc_lock, flags); + + for (i=0; i<pnum_max; ++i) { /* NOTE(review): from this '<' up to "zp->flags" the text was lost in extraction and is reconstructed - verify against the original posting */ + zp = &zc_pages[i]; + if (test_bit(ZC_PAGE_READY, &zp->flags)) { + flush_dcache_page(zp->page); + err = a_ops->commit_write(file, zp->page, zp->page_offset, zp->page_offset+zp->used); + unlock_page(zp->page); + mark_page_accessed(zp->page); + page_cache_release(zp->page); + + if (err < 0) + goto err_out_release_all_pages; + + balance_dirty_pages_ratelimited(mapping); + + count -= zp->used; + written += zp->used; + pos += zp->used; + + zc_page_index++; + + err = prepare_page(zp, file, mapping, &pos_allocated, count, &lru_pvec); /* slot is re-armed with the next page of the file; NOTE(review): this return value is not checked */ + } + } + } + + pagevec_lru_add(&lru_pvec); + + *ppos += written; + + return written; /* NOTE(review): on this path zc_pages is neither freed nor detached from sk, and the pages still prepared are not released */ + +err_out_release_all_pages: + i = pnum_max; +err_out_release_pages: + for (--i; i>=0; --i) { + unlock_page(zc_pages[i].page); + page_cache_release(zc_pages[i].page); + if (zc_pages[i].cached_page) + page_cache_release(zc_pages[i].cached_page); + } + + kfree(zc_pages); /* NOTE(review): sk->zc_pages may still point at this memory when reached via err_out_release_all_pages */ + +err_out_exit: + + return err; +} + static ssize_t sock_sendpage(struct file *file, struct page *page, int offset, size_t size, loff_t *ppos, int more) {