diff --git a/drivers/net/8139too.c b/drivers/net/8139too.c --- a/drivers/net/8139too.c +++ b/drivers/net/8139too.c @@ -82,6 +82,8 @@ Robert Kuebel - Save kernel thread from dying on any signal. + Evgeniy Polyakov - Added receiving zero-copy support. + Submitting bug reports: "rtl8139-diag -mmmaaavvveefN" output @@ -91,7 +93,7 @@ #define DRV_NAME "8139too" #define DRV_VERSION "0.9.27" - +#define DRV_EXT "-zc" #include #include @@ -108,11 +110,16 @@ #include #include #include +#include +#include +#include +#include +#include #include #include #include -#define RTL8139_DRIVER_NAME DRV_NAME " Fast Ethernet driver " DRV_VERSION +#define RTL8139_DRIVER_NAME DRV_NAME " Fast Ethernet driver " DRV_VERSION DRV_EXT #define PFX DRV_NAME ": " /* Default Message level */ @@ -1895,16 +1902,23 @@ static void rtl8139_rx_err (u32 rx_statu } #if RX_BUF_IDX == 3 -static __inline__ void wrap_copy(struct sk_buff *skb, const unsigned char *ring, +static __inline__ void __wrap_copy(void *data, const unsigned char *ring, u32 offset, unsigned int size) { u32 left = RX_BUF_LEN - offset; if (size > left) { - memcpy(skb->data, ring + offset, left); - memcpy(skb->data+left, ring, size - left); + memcpy(data, ring + offset, left); + memcpy(data+left, ring, size - left); } else - memcpy(skb->data, ring + offset, size); + memcpy(data, ring + offset, size); + +} + +static __inline__ void wrap_copy(struct sk_buff *skb, const unsigned char *ring, + u32 offset, unsigned int size) +{ + __wrap_copy(skb->data, ring, offset, size); } #endif @@ -1926,6 +1940,108 @@ static void rtl8139_isr_ack(struct rtl81 } } +static void rtl8139_copy(void *dst, unsigned char *rx_ring, u32 ring_offset, int size) +{ + if (!size) + return; +#if RX_BUF_IDX == 3 + __wrap_copy(dst, rx_ring, ring_offset, size); +#else + memcpy(dst, &rx_ring[ring_offset], size); +#endif +} + +static int rtl8139_move_data(struct zc_buf *zb, unsigned int sz) +{ + struct rtl8139_private *tp = zb->priv_data; + unsigned char *rx_ring = tp->rx_ring; + unsigned int cur_rx = tp->cur_rx; + u32 ring_offset = cur_rx % RX_BUF_LEN; + struct sk_buff *skb = zb->skb; + skb_frag_t *frag; + void *dest; + + if (unlikely(skb_shinfo(skb)->nr_frags == 0) || unlikely(skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS)) + return -EINVAL; + + frag = &skb_shinfo(skb)->frags[skb_shinfo(skb)->nr_frags-1]; + dest = page_address(frag->page) + frag->page_offset; + + rtl8139_copy(dest, rx_ring, ring_offset + 4 + zb->header_size, sz); + + return sz; +} + +static int rtl8139_copy_header(struct zc_buf *zb) +{ + struct rtl8139_private *tp = zb->priv_data; + unsigned char *rx_ring = tp->rx_ring; + unsigned int cur_rx = tp->cur_rx; + u32 ring_offset = cur_rx % RX_BUF_LEN; + u8 *orig_ptr, *ptr = zb->header; + int tocopy, hsize = 0; + struct iphdr *iph; + struct tcphdr *tcph; + struct ethhdr *eth; + + orig_ptr = ptr; + + ring_offset += 4; + + tocopy = sizeof(struct ethhdr); + rtl8139_copy(ptr, rx_ring, ring_offset, tocopy); + ptr += tocopy; + ring_offset += tocopy; + + eth = (struct ethhdr *)(ptr - sizeof(struct ethhdr)); + if (eth->h_proto != htons(ETH_P_IP)) + return -1; + + tocopy = sizeof(struct iphdr); + rtl8139_copy(ptr, rx_ring, ring_offset, tocopy); + ptr += tocopy; + ring_offset += tocopy; + + iph = (struct iphdr *)(ptr - sizeof(struct iphdr)); + if (iph->protocol == IPPROTO_TCP) { + hsize = sizeof(struct tcphdr); + } else if (iph->protocol == IPPROTO_UDP) { + hsize = sizeof(struct udphdr); + } else + return -1; + + tocopy = iph->ihl*4 - sizeof(struct iphdr) + hsize; + if (tocopy + ptr - orig_ptr > zb->header_size) + return -1; + rtl8139_copy(ptr, rx_ring, ring_offset, tocopy); + ptr += tocopy; + ring_offset += tocopy; + + if (iph->protocol == IPPROTO_TCP) { + tcph = (struct tcphdr *)(((u8 *)(iph)) + iph->ihl*4); + tocopy = tcph->doff*4; + if (tocopy + ptr - orig_ptr > zb->header_size) + return -1; + rtl8139_copy(ptr, rx_ring, ring_offset, tocopy); + ptr += tocopy; + ring_offset += tocopy; + } + + zb->header_size = ptr - orig_ptr; + zb->size -= zb->header_size; + + return 0; +} + +static void rtl8139_work_func(void *data) +{ + struct sk_buff *skb = data; + + netif_receive_skb(skb); +} + +static DECLARE_WORK(rtl8139_work, &rtl8139_work_func, NULL); + static int rtl8139_rx(struct net_device *dev, struct rtl8139_private *tp, int budget) { @@ -1956,8 +2072,7 @@ static int rtl8139_rx(struct net_device if (netif_msg_rx_status(tp)) printk(KERN_DEBUG "%s: rtl8139_rx() status %4.4x, size %4.4x," - " cur %4.4x.\n", dev->name, rx_status, - rx_size, cur_rx); + " cur %4.4x.\n", dev->name, rx_status, rx_size, cur_rx); #if RTL8139_DEBUG > 2 { int i; @@ -2005,34 +2120,77 @@ no_early_rx: goto out; } - /* Malloc up new buffer, compatible with net-2e. */ - /* Omit the four octet CRC from the length. */ + { + u8 zc_data[256]; + struct zc_buf *zb; + + memset(&zc_data, 0, sizeof(zc_data)); + zb = (struct zc_buf *)zc_data; - skb = dev_alloc_skb (pkt_size + 2); - if (likely(skb)) { - skb->dev = dev; - skb_reserve (skb, 2); /* 16 byte align the IP fields. */ + zb->header = (void *)(zb + 1); + zb->header_size = sizeof(zc_data) - sizeof(struct zc_buf); + zb->size = pkt_size; + zb->priv_data = tp; + zb->move_data = &rtl8139_move_data; + + if (!rtl8139_copy_header(zb)) { + skb = alloc_skb_zerocopy(zb, GFP_ATOMIC); + if (skb) { + skb->dev = dev; + skb->protocol = eth_type_trans(skb, dev); + + dev->last_rx = jiffies; + tp->stats.rx_bytes += pkt_size; + tp->stats.rx_packets++; + netif_receive_skb(skb); + //rtl8139_work.data = skb; + //schedule_work(&rtl8139_work); + } + } else { + skb = NULL; + zb->status = -1; + } + + if (!skb) { + if (zb->status == -1) { + /* Malloc up new buffer, compatible with net-2e. */ + /* Omit the four octet CRC from the length. */ + + skb = dev_alloc_skb (pkt_size + 2); + if (likely(skb)) { + skb->dev = dev; + skb_reserve (skb, 2); /* 16 byte align the IP fields. */ #if RX_BUF_IDX == 3 - wrap_copy(skb, rx_ring, ring_offset+4, pkt_size); + wrap_copy(skb, rx_ring, ring_offset+4, pkt_size); #else - eth_copy_and_sum (skb, &rx_ring[ring_offset + 4], pkt_size, 0); + eth_copy_and_sum (skb, &rx_ring[ring_offset + 4], pkt_size, 0); #endif - skb_put (skb, pkt_size); - - skb->protocol = eth_type_trans (skb, dev); + skb_put (skb, pkt_size); - dev->last_rx = jiffies; - tp->stats.rx_bytes += pkt_size; - tp->stats.rx_packets++; + skb->protocol = eth_type_trans (skb, dev); - netif_receive_skb (skb); - } else { - if (net_ratelimit()) - printk (KERN_WARNING - "%s: Memory squeeze, dropping packet.\n", - dev->name); - tp->stats.rx_dropped++; + dev->last_rx = jiffies; + tp->stats.rx_bytes += pkt_size; + tp->stats.rx_packets++; + + netif_receive_skb(skb); + } else { + if (net_ratelimit()) + printk (KERN_WARNING + "%s: Memory squeeze, dropping packet.\n", + dev->name); + tp->stats.rx_dropped++; + } + } else { + if (net_ratelimit()) + printk (KERN_WARNING + "%s: Zero-copy failed, dropping packet.\n", + dev->name); + tp->stats.rx_dropped++; + } + } } + received++; cur_rx = (cur_rx + rx_size + 4 + 3) & ~3; diff --git a/fs/read_write.c b/fs/read_write.c --- a/fs/read_write.c +++ b/fs/read_write.c @@ -15,6 +15,8 @@ #include #include +#include + #include #include @@ -670,8 +672,15 @@ static ssize_t do_sendfile(int out_fd, i if (!(out_file->f_mode & FMODE_WRITE)) goto fput_out; retval = -EINVAL; - if (!out_file->f_op || !out_file->f_op->sendpage) + if (!out_file->f_op) + goto fput_out; + + if (!SOCKET_I(in_file->f_dentry->d_inode) && !out_file->f_op->sendpage) { + printk("%s: sock=%p, sendpage=%p.\n", __func__, + SOCKET_I(in_file->f_dentry->d_inode), out_file->f_op->sendpage); goto fput_out; + } + out_inode = out_file->f_dentry->d_inode; retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count); if (retval) @@ -688,7 +697,7 @@ static ssize_t do_sendfile(int out_fd, i retval = -EINVAL; if (unlikely(pos < 0)) goto fput_out; - if (unlikely(pos + count > max)) { + if (unlikely((unsigned long long)(pos + count) > (unsigned long long)max)) { retval = -EOVERFLOW; if (pos >= max) goto fput_out; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -34,6 +34,8 @@ #define HAVE_ALIGNABLE_SKB /* Ditto 8) */ #define SLAB_SKB /* Slabified skbuffs */ +#define ZEROCOPY_HEADER_CACHE_SIZE 256 /* Maximum receiving zero-copy header size */ + #define CHECKSUM_NONE 0 #define CHECKSUM_HW 1 #define CHECKSUM_UNNECESSARY 2 @@ -261,7 +263,8 @@ struct sk_buff { nohdr:1, nfctinfo:3; __u8 pkt_type:3, - fclone:2; + fclone:2, + zerocopy:1; __be16 protocol; void (*destructor)(struct sk_buff *skb); @@ -1045,6 +1048,36 @@ static inline struct sk_buff *dev_alloc_ return __dev_alloc_skb(length, GFP_ATOMIC); } +struct zc_buf; + +struct zc_handler +{ + struct list_head zc_entry; + int (* alloc_data)(struct zc_buf *zb); + int (* commit_data)(struct zc_buf *zb); +}; + +struct zc_buf +{ + struct zc_handler *zh; + void *header; + unsigned int header_size; + unsigned int size; + void *priv; + int status; + struct sk_buff *skb; + int (* move_data)(struct zc_buf *zb, unsigned int sz); + void *priv_data; +}; + + +extern struct sk_buff *__alloc_skb_zerocopy(struct zc_buf *zb, gfp_t gfp_mask); + +static inline struct sk_buff *alloc_skb_zerocopy(struct zc_buf *zb, gfp_t gfp_mask) +{ + return __alloc_skb_zerocopy(zb, gfp_mask); +} + /** * skb_cow - copy header of skb when it is required * @skb: buffer to cow diff --git a/include/net/sock.h b/include/net/sock.h --- a/include/net/sock.h +++ b/include/net/sock.h @@ -117,6 +117,20 @@ struct sock_common { struct proto *skc_prot; }; +enum zc_flags { + ZC_PAGE_READY = 0, +}; + +struct zc_page +{ + struct page *page; + struct page *cached_page; + unsigned int page_offset; + unsigned int size; + unsigned int used; + long flags; +}; + /** * struct sock - network layer representation of sockets * @__sk_common: shared layout with inet_timewait_sock @@ -251,6 +265,14 @@ struct sock { int (*sk_backlog_rcv)(struct sock *sk, struct sk_buff *skb); void (*sk_destruct)(struct sock *sk); + + int (* zc_alloc_data)(struct zc_buf *zb); + int (* zc_commit_data)(struct zc_buf *zb); + wait_queue_head_t zc_data_ready; + spinlock_t zc_lock; + struct zc_page *zc_pages; + unsigned int zc_page_num, zc_page_index; + unsigned int zc_users; }; /* diff --git a/mm/filemap.c b/mm/filemap.c --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1663,7 +1663,7 @@ EXPORT_SYMBOL(read_cache_page); * caller's lru-buffering pagevec. This function is specifically for * generic_file_write(). */ -static inline struct page * +struct page * __grab_cache_page(struct address_space *mapping, unsigned long index, struct page **cached_page, struct pagevec *lru_pvec) { @@ -1692,6 +1692,8 @@ repeat: return page; } +EXPORT_SYMBOL_GPL(__grab_cache_page); + /* * The logic we want is * diff --git a/net/core/Makefile b/net/core/Makefile --- a/net/core/Makefile +++ b/net/core/Makefile @@ -3,7 +3,7 @@ # obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \ - gen_stats.o gen_estimator.o + gen_stats.o gen_estimator.o zerocopy.o obj-$(CONFIG_SYSCTL) += sysctl_net_core.o diff --git a/net/core/datagram.c b/net/core/datagram.c --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -213,6 +213,10 @@ int skb_copy_datagram_iovec(const struct { int i, err, fraglen, end = 0; struct sk_buff *next = skb_shinfo(skb)->frag_list; + + if (skb->zerocopy) + return 0; + next_skb: fraglen = skb_headlen(skb); i = -1; @@ -364,6 +368,9 @@ int skb_copy_and_csum_datagram_iovec(con { unsigned int csum; int chunk = skb->len - hlen; + + if (skb->zerocopy) + return 0; /* Skip filled elements. * Pretty silly, look at memcpy_toiovec, though 8) diff --git a/net/core/skbuff.c b/net/core/skbuff.c --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -70,6 +70,7 @@ static kmem_cache_t *skbuff_head_cache __read_mostly; static kmem_cache_t *skbuff_fclone_cache __read_mostly; +static kmem_cache_t *skbuff_head_cache_zerocopy __read_mostly; /* * Keep out-of-line to prevent kernel bloat. @@ -182,6 +183,64 @@ nodata: goto out; } +int zc_alloc_data(struct zc_buf *zb); +int zc_commit_data(struct zc_buf *zb); + +struct sk_buff *__alloc_skb_zerocopy(struct zc_buf *zb, gfp_t gfp_mask) +{ + struct sk_buff *skb = NULL; + void *data; + int err; + unsigned int size = SKB_DATA_ALIGN(zb->header_size); + + if (size > ZEROCOPY_HEADER_CACHE_SIZE) + goto out; + + zb->status = -1; + + skb = kmem_cache_alloc(skbuff_head_cache, gfp_mask & ~__GFP_DMA); + if (!skb) + goto out; + + data = kmem_cache_alloc(skbuff_head_cache_zerocopy, gfp_mask & ~__GFP_DMA); + if (!data) + goto err_out_free_skb; + + memset(skb, 0, offsetof(struct sk_buff, truesize)); + skb->truesize = size + sizeof(struct sk_buff); + atomic_set(&skb->users, 1); + skb->head = data; + skb->data = data; + skb->tail = data; + skb->end = data + size; + + atomic_set(&(skb_shinfo(skb)->dataref), 1); + skb_shinfo(skb)->nr_frags = 0; + skb_shinfo(skb)->tso_size = 0; + skb_shinfo(skb)->tso_segs = 0; + skb_shinfo(skb)->frag_list = NULL; + + skb->zerocopy = 1; + /* It could be zerocopied too, but let's use it as is for now. --zbr 2005_10_27 */ + memcpy(skb->data, zb->header, zb->header_size); + skb_put(skb, zb->header_size); + + zb->skb = skb; + + err = zc_alloc_data(zb); + if (err) + goto err_out_free_skb_data; + +out: + return skb; +err_out_free_skb_data: + kmem_cache_free(skbuff_head_cache_zerocopy, data); +err_out_free_skb: + kmem_cache_free(skbuff_head_cache, skb); + skb = NULL; + goto out; +} + /** * alloc_skb_from_cache - allocate a network buffer * @cp: kmem_cache from which to allocate the data area @@ -284,7 +343,10 @@ void kfree_skbmem(struct sk_buff *skb) struct sk_buff *other; atomic_t *fclone_ref; - skb_release_data(skb); + if (skb->zerocopy) + kmem_cache_free(skbuff_head_cache_zerocopy, skb->head); + else + skb_release_data(skb); switch (skb->fclone) { case SKB_FCLONE_UNAVAILABLE: kmem_cache_free(skbuff_head_cache, skb); @@ -1706,6 +1768,14 @@ void __init skb_init(void) NULL, NULL); if (!skbuff_fclone_cache) panic("cannot create skbuff cache"); + + skbuff_head_cache_zerocopy = kmem_cache_create("skbuff_head_cache_zerocopy", + ZEROCOPY_HEADER_CACHE_SIZE + sizeof(struct skb_shared_info), + 0, + SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (!skbuff_head_cache_zerocopy) + panic("cannot create zerocopy skbuff cache"); } EXPORT_SYMBOL(___pskb_trim); @@ -1739,3 +1809,4 @@ EXPORT_SYMBOL(skb_prepare_seq_read); EXPORT_SYMBOL(skb_seq_read); EXPORT_SYMBOL(skb_abort_seq_read); EXPORT_SYMBOL(skb_find_text); +EXPORT_SYMBOL(__alloc_skb_zerocopy); diff --git a/net/core/sock.c b/net/core/sock.c --- a/net/core/sock.c +++ b/net/core/sock.c @@ -704,6 +704,18 @@ void sk_free(struct sock *sk) module_put(owner); } +static void zc_sk_init(struct sock *sk) +{ + spin_lock_init(&sk->zc_lock); + init_waitqueue_head(&sk->zc_data_ready); + sk->zc_pages = NULL; + sk->zc_page_num = 0; + sk->zc_page_index = 0; + sk->zc_alloc_data = NULL; + sk->zc_commit_data = NULL; +} + + struct sock *sk_clone(const struct sock *sk, const gfp_t priority) { struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0); @@ -737,6 +749,8 @@ struct sock *sk_clone(const struct sock sock_reset_flag(newsk, SOCK_DONE); skb_queue_head_init(&newsk->sk_error_queue); + zc_sk_init(newsk); + filter = newsk->sk_filter; if (filter != NULL) sk_filter_charge(newsk, filter); @@ -1320,6 +1334,8 @@ void sock_init_data(struct socket *sock, sk->sk_stamp.tv_usec = -1L; atomic_set(&sk->sk_refcnt, 1); + + zc_sk_init(sk); } void fastcall lock_sock(struct sock *sk) diff --git a/net/core/zerocopy.c b/net/core/zerocopy.c new file mode 100644 --- /dev/null +++ b/net/core/zerocopy.c @@ -0,0 +1,165 @@ +/* + * zerocopy.c + * + * 2005 Copyright (c) Evgeniy Polyakov + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include + +#include + +static int tcp_sendfile_alloc_data(struct zc_buf *zb); +static int tcp_sendfile_commit_data(struct zc_buf *zb); + +static struct zc_handler zc_tcp_sendfile_handler = { + .alloc_data = &tcp_sendfile_alloc_data, + .commit_data = &tcp_sendfile_commit_data, +}; + +static DEFINE_SPINLOCK(zc_lock); +static LIST_HEAD(zc_list); + +int zc_alloc_data(struct zc_buf *zb) +{ + struct zc_handler *zh; + int err = -ENODEV; + + if (unlikely(zb->size > PAGE_SIZE)) + return err; + + rcu_read_lock(); + list_for_each_entry_rcu(zh, &zc_list, zc_entry) { + err = zh->alloc_data(zb); + if (!err) { + zb->zh = zh; + break; + } + } + rcu_read_unlock(); + + return err; +} + +int zc_commit_data(struct zc_buf *zb) +{ + int err = -EINVAL; + + if (zb->zh) + err = zb->zh->commit_data(zb); + + return err; +} + +int zc_add_handler(struct zc_handler *h) +{ + if (!h->alloc_data || !h->commit_data) + return -EINVAL; + + spin_lock(&zc_lock); + list_add_rcu(&h->zc_entry, &zc_list); + spin_unlock(&zc_lock); + + return 0; +} + +void zc_del_handler(struct zc_handler *h) +{ + spin_lock(&zc_lock); + list_del_rcu(&h->zc_entry); + spin_unlock(&zc_lock); + + synchronize_rcu(); +} + +extern struct inet_hashinfo __cacheline_aligned tcp_hashinfo; + +static int tcp_sendfile_alloc_data(struct zc_buf *zb) +{ + struct ethhdr *eth; + struct iphdr *iph; + struct tcphdr *tcph; + struct sock *sk; + int dif, err = -EINVAL; + u32 saddr, daddr; + u16 sport, dport; + + if (zb->header_size < sizeof(struct ethhdr) + sizeof(struct iphdr) + sizeof(struct tcphdr)) + goto err_out_exit; + + eth = zb->header; + + if (eth->h_proto != htons(ETH_P_IP)) + goto err_out_exit; + + iph = (struct iphdr *)(eth + 1); + + if (iph->protocol != IPPROTO_TCP) + goto err_out_exit; + + tcph = (struct tcphdr *)(iph + 1); + + dif = 0; + + saddr = iph->saddr; + sport = tcph->source; + daddr = iph->daddr; + dport = tcph->dest; + + /* + * I suspect it is not enough to disable BHs, + * since it can be [and is] called from hard IRQ context. + * Must do something with bound devices. + */ + sk = inet_lookup(&tcp_hashinfo, saddr, sport, daddr, dport, dif); + + if (sk && sk->zc_alloc_data) { + zb->priv = sk; + err = sk->zc_alloc_data(zb); + zb->status = (err)?1:0; + } +#if 0 + printk("%s: sk=%p, %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u, data=%p, status=%d, err=%d.\n", + __func__, sk, + NIPQUAD(saddr), ntohs(sport), + NIPQUAD(daddr), ntohs(dport), + zb->skb->data, zb->status, err); +#endif +err_out_exit: + return err; +} + +static int tcp_sendfile_commit_data(struct zc_buf *zb) +{ + struct sock *sk = zb->priv; + int err; + + err = sk->zc_commit_data(zb); + + printk("%s: commiting data, sk=%p, size=%4u, err=%d.\n", __func__, sk, zb->size, err); + + return err; +} + +int __init zc_add_tcp(void) +{ + return zc_add_handler(&zc_tcp_sendfile_handler); +} + +late_initcall(zc_add_tcp); diff --git a/net/socket.c b/net/socket.c --- a/net/socket.c +++ b/net/socket.c @@ -44,6 +44,7 @@ * Tigran Aivazian : sys_send(args) calls sys_sendto(args, NULL, 0) * Tigran Aivazian : Made listen(2) backlog sanity checks * protocol-independent + * Evgeniy Polyakov: Added sock_sendfile(). * * * This program is free software; you can redistribute it and/or @@ -84,6 +85,10 @@ #include #include #include +#include +#include +#include +#include #ifdef CONFIG_NET_RADIO #include /* Note : will define WIRELESS_EXT */ @@ -116,6 +121,7 @@ static ssize_t sock_writev(struct file * unsigned long count, loff_t *ppos); static ssize_t sock_sendpage(struct file *file, struct page *page, int offset, size_t size, loff_t *ppos, int more); +ssize_t sock_sendfile(struct file *file, loff_t *ppos, size_t count, read_actor_t actor, void *target); /* @@ -136,7 +142,8 @@ static struct file_operations socket_fil .fasync = sock_fasync, .readv = sock_readv, .writev = sock_writev, - .sendpage = sock_sendpage + .sendpage = sock_sendpage, + .sendfile = sock_sendfile, }; /* @@ -726,6 +733,395 @@ static ssize_t sock_aio_write(struct kio return __sock_sendmsg(iocb, sock, &x->async_msg, size); } +int zc_sock_alloc_data(struct zc_buf *zb) +{ + struct sock *sk = zb->priv; + unsigned long flags; + struct zc_page *zp; + int err = -ENODEV, need_wakeup = 0; + unsigned int towrite = zb->size; + struct sk_buff *skb = zb->skb; + + if (!sk || !sk->zc_page_num || !skb) + goto out; + + spin_lock_irqsave(&sk->zc_lock, flags); + if (!sk->zc_pages) + goto out_unlock; + + BUG_ON(sk->zc_page_index + 1 > sk->zc_page_num); + + need_wakeup = 1; + while (towrite) { + zp = &sk->zc_pages[sk->zc_page_index]; + if (zp->size == zp->used || test_bit(ZC_PAGE_READY, &zp->flags)) { + set_bit(ZC_PAGE_READY, &zp->flags); + + if (++sk->zc_page_index == sk->zc_page_num) + sk->zc_page_index = 0; + + zp = &sk->zc_pages[sk->zc_page_index]; + if (zp->size == zp->used || test_bit(ZC_PAGE_READY, &zp->flags)) + break; + } + if (zp->size - zp->used < towrite && !zb->move_data) + break; + + if (skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS) { + err = -ENOMEM; + break; + } + /* + * Setup fragment with offset to point to the area where + * we actually can write without overwriting old data. + * Setup fragment size to be equal not to the real data size, + * but size of the area where we actually can write data into. + */ + skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags, zp->page, zp->page_offset+zp->used, zp->size-zp->used); + + printk("%s: [%1d.%1d] data=%p [%p], size=%4u, used=%4u, towrite=%4u, users=%u.\n", + __func__, sk->zc_page_index, skb_shinfo(skb)->nr_frags-1, + zp->page, page_address(zp->page) + zp->page_offset, + zp->size, zp->used, towrite, sk->zc_users); + + if (zb->move_data) { + unsigned int sz = min(zp->size - zp->used, towrite); + + err = zb->move_data(zb, sz); + if (err <= 0) + break; + + if (zp->used + err == zp->size) { + printk("%s: [%1d.%1d] data=%p [%p], size=%4u, used=%4u, towrite=%4u, users=%u, page is ready.\n", + __func__, sk->zc_page_index, skb_shinfo(skb)->nr_frags-1, + zp->page, page_address(zp->page) + zp->page_offset, + zp->size, zp->used, towrite, sk->zc_users); + set_bit(ZC_PAGE_READY, &zp->flags); + if (++sk->zc_page_index == sk->zc_page_num) + sk->zc_page_index = 0; + } + } else + err = zb->size; + + skb->len += err; + skb->data_len += err; + skb->truesize += err; + + towrite -= err; + zp->used += err; + + err = 0; + } + + if (!err) + sk->zc_users += skb->data_len; + +out_unlock: + spin_unlock_irqrestore(&sk->zc_lock, flags); +out: + if (need_wakeup) + wake_up(&sk->zc_data_ready); + return err; +} + +int zc_sock_commit_data(struct zc_buf *zb) +{ + struct sock *sk = zb->priv; + unsigned long flags; + struct zc_page *zp; + + spin_lock_irqsave(&sk->zc_lock, flags); + + BUG_ON(sk->zc_page_index + 1 > sk->zc_page_num); + + zp = &sk->zc_pages[sk->zc_page_index]; + + if (unlikely(zb->size != zp->size)) { + spin_unlock_irqrestore(&sk->zc_lock, flags); + return 1; + } + + if (zp->used == zp->size) { + set_bit(ZC_PAGE_READY, &zp->flags); + if (++sk->zc_page_index == sk->zc_page_num) + sk->zc_page_index = 0; + } + spin_unlock_irqrestore(&sk->zc_lock, flags); + + wake_up(&sk->zc_data_ready); + + return 0; +} + +extern struct page * __grab_cache_page(struct address_space *mapping, unsigned long index, + struct page **cached_page, struct pagevec *lru_pvec); + +static int commit_page(struct zc_page *zp, struct file *file, struct address_space *mapping) +{ + int err; + struct address_space_operations *a_ops = mapping->a_ops; + + flush_dcache_page(zp->page); + err = a_ops->commit_write(file, zp->page, zp->page_offset, zp->page_offset+zp->used); + unlock_page(zp->page); + mark_page_accessed(zp->page); + page_cache_release(zp->page); + if (zp->cached_page) { + page_cache_release(zp->cached_page); + zp->cached_page = NULL; + } + + printk("%s: zp=%p, page=%p, page_offset=%u, used=%u, size=%u has been committed: err=%d.\n", + __func__, zp, zp->page, zp->page_offset, zp->used, zp->size, err); + + if (err < 0) + goto err_out_exit; + + balance_dirty_pages_ratelimited(mapping); + +err_out_exit: + return err; +} + +static int prepare_page(struct zc_page *zp, struct file *file, struct address_space *mapping, + loff_t *ppos, loff_t count, struct pagevec *lru_pvec) +{ + unsigned long index; + unsigned long page_offset; + unsigned long bytes; + struct address_space_operations *a_ops = mapping->a_ops; + loff_t pos_allocated = *ppos; + int err = 0; + + page_offset = (pos_allocated & (PAGE_CACHE_SIZE -1)); + index = pos_allocated >> PAGE_CACHE_SHIFT; + bytes = PAGE_CACHE_SIZE - page_offset; + if (bytes > count) + bytes = count; + + zp->page = __grab_cache_page(mapping, index, &zp->cached_page, lru_pvec); + if (!zp->page) { + err = -ENOMEM; + goto err_out_exit; + } + + err = a_ops->prepare_write(file, zp->page, page_offset, page_offset+bytes); + if (unlikely(err)) { + unlock_page(zp->page); + page_cache_release(zp->page); + goto err_out_exit; + } + + zp->page_offset = page_offset; + zp->size = bytes; + zp->used = 0; + clear_bit(ZC_PAGE_READY, &zp->flags); + + printk("%s: zp=%p, page=%p, page_offset=%u, used=%u, size=%u has been prepared: err=%d.\n", + __func__, zp, zp->page, zp->page_offset, zp->used, zp->size, err); + + pos_allocated += bytes; + + *ppos = pos_allocated; + +err_out_exit: + return err; +} + +/* + * This should process all socket's related stuff, + * for example emit TCP ACKs... + * Since zero-copy skb can only have valid header, + * this should process that header at skb->data. + * skb_copy_datagram_iovec() is changed to not even touch + * zero-copied skb. + */ + +static u8 message_buf[PAGE_SIZE]; + +int receive_message(struct socket *sock, unsigned int ack_size) +{ + struct msghdr msg; + struct kvec iov; + int err; + + sock->sk->sk_allocation |= GFP_NOIO; + iov.iov_base = message_buf; + iov.iov_len = min(ack_size, sizeof(message_buf)); + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_namelen = 0; + msg.msg_flags = MSG_NOSIGNAL | MSG_DONTWAIT; + + err = kernel_recvmsg(sock, &msg, &iov, 1, iov.iov_len, msg.msg_flags); + + printk("%s: kernel_recvmsg returned %d, ack_size=%u.\n", __func__, err, ack_size); + + return err; +} + +ssize_t sock_sendfile(struct file *in_file, loff_t *ppos, size_t count, read_actor_t actor, void *target) +{ + struct socket *sock; + struct sock *sk; + int err = 0; + size_t written = 0; + struct file *file = target; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + loff_t pos, pos_allocated; + struct pagevec lru_pvec; + unsigned long flags; + int pnum_max = 16, i; + unsigned int zc_page_index, ack_size; + struct zc_page *zc_pages, *zp; + + if (!count) + return 0; + + pos = pos_allocated = *ppos; + err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); + if (err) + goto err_out_exit; + + sock = SOCKET_I(in_file->f_dentry->d_inode); + + if (!sock || !sock->sk) { + err = -ENODEV; + goto err_out_exit; + } + sk = sock->sk; + + pnum_max = ((count >> PAGE_CACHE_SHIFT) > pnum_max)?pnum_max:(count >> PAGE_CACHE_SHIFT); + zc_pages = kzalloc(sizeof(struct zc_page) * pnum_max, GFP_KERNEL); + if (!zc_pages) { + err = -ENOMEM; + goto err_out_exit; + } + + pagevec_init(&lru_pvec, 0); + + err = 0; + for (i=0; izc_lock, flags); + sk->zc_pages = zc_pages; + sk->zc_page_num = pnum_max; + sk->zc_page_index = zc_page_index; + sk->zc_alloc_data = &zc_sock_alloc_data; + sk->zc_commit_data = &zc_sock_commit_data; + spin_unlock_irqrestore(&sk->zc_lock, flags); + + printk("%s: sk=%p, %d pages have been set up.\n", __func__, sk, pnum_max); + + while (count) { + struct zc_page *zp; + + interruptible_sleep_on_timeout(&sk->zc_data_ready, 5*HZ); + + printk("%s: wakeup: zc_page_index=%d, sk->zc_page_index=%d, sk->sk_state=%d.\n", + __func__, zc_page_index, sk->zc_page_index, sk->sk_state); + + ack_size = 0; + for (i=0; iflags)) { + printk("%s: checking page %p [%d]: page=%p, flags=%08lx, page_offset=%08x, size=%08x, used=%08x, written=%zx.\n", + __func__, zp, i, zp->page, zp->flags, zp->page_offset, zp->size, zp->used, written); + + err = commit_page(zp, file, mapping); + if (err) + goto err_out_release_all_pages; + + count -= zp->used; + written += zp->used; + pos += zp->used; + + if (++zc_page_index >= pnum_max) + zc_page_index = 0; + + ack_size += zp->used; + + err = prepare_page(zp, file, mapping, &pos_allocated, count, &lru_pvec); + } + } + + while (ack_size) { + err = receive_message(sock, ack_size); + if (err > 0) { + spin_lock_irqsave(&sk->zc_lock, flags); + sk->zc_users -= err; + spin_unlock_irqrestore(&sk->zc_lock, flags); + ack_size -= err; + } else + break; + } + + if (signal_pending(current)) + break; + } + + pagevec_lru_add(&lru_pvec); + + *ppos = pos; + err = written; + +err_out_release_all_pages: + i = pnum_max; +err_out_release_pages: + spin_lock_irqsave(&sk->zc_lock, flags); + sk->zc_pages = NULL; + sk->zc_page_num = 0; + sk->zc_page_index = 0; + sk->zc_alloc_data = NULL; + sk->zc_commit_data = NULL; + spin_unlock_irqrestore(&sk->zc_lock, flags); + + /* + * No new skbs can contribute data into VFS cache after this + * condition, so we only must care about those which are + * in socket queue already or will be inserted there after + * allocation, but allocation itself will always fail + * due to above locked changes. + */ + + for (--i; i>=0; --i) + commit_page(&zc_pages[i], file, mapping); + + while (sk->zc_users) { + struct sk_buff *skb; + + interruptible_sleep_on_timeout(&sk->zc_data_ready, 5*HZ); + + printk("%s: going to flush receive queue: sk->zc_users=%u.\n", + __func__, sk->zc_users); + + while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { + sk->zc_users -= skb->data_len; + kfree_skb(skb); + } + } + + printk("%s: flushed: sk->zc_users=%u.\n", __func__, sk->zc_users); + + kfree(zc_pages); + +err_out_exit: + + return err; +} + static ssize_t sock_sendpage(struct file *file, struct page *page, int offset, size_t size, loff_t *ppos, int more) {