mirror of
				git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
				synced 2025-09-18 22:14:16 +00:00 
			
		
		
		
	 a74f0fa082
			
		
	
	
		a74f0fa082
		
	
	
	
	
		
			
TCP_NOTSENT_LOWAT socket option or sysctl was added in linux-3.12
as a step to enable bigger tcp sndbuf limits.

It works reasonably well, but the following happens :

Once the limit is reached, TCP stack generates
an [E]POLLOUT event for every incoming ACK packet.

This causes a high number of context switches.

This patch implements the strategy David Miller added
in sock_def_write_space() :

 - If TCP socket has a notsent_lowat constraint of X bytes,
   allow sendmsg() to fill up to X bytes, but send [E]POLLOUT
   only if number of notsent bytes is below X/2

This considerably reduces TCP_NOTSENT_LOWAT overhead,
while allowing to keep the pipe full.

Tested:
 100 ms RTT netem testbed between A and B, 100 concurrent TCP_STREAM

A:/# cat /proc/sys/net/ipv4/tcp_wmem
4096 262144 64000000
A:/# super_netperf 100 -H B -l 1000 -- -K bbr &

A:/# grep TCP /proc/net/sockstat
TCP: inuse 203 orphan 0 tw 19 alloc 414 mem 1364904 # This is about 54 MB of memory per flow :/

A:/# vmstat 5 5
procs -----------memory---------- ---swap-- -----io---- -system-- ------cpu-----
r b swpd free buff cache si so bi bo in cs us sy id wa st
0 0 0 256220672 13532 694976 0 0 10 0 28 14 0 1 99 0 0
2 0 0 256320016 13532 698480 0 0 512 0 715901 5927 0 10 90 0 0
0 0 0 256197232 13532 700992 0 0 735 13 771161 5849 0 11 89 0 0
1 0 0 256233824 13532 703320 0 0 512 23 719650 6635 0 11 89 0 0
2 0 0 256226880 13532 705780 0 0 642 4 775650 6009 0 12 88 0 0

A:/# echo 2097152 >/proc/sys/net/ipv4/tcp_notsent_lowat

A:/# grep TCP /proc/net/sockstat
TCP: inuse 203 orphan 0 tw 19 alloc 414 mem 86411 # 3.5 MB per flow

A:/# vmstat 5 5 # check that context switches have not inflated too much.
procs -----------memory---------- ---swap-- -----io---- -system-- ------cpu-----
r b swpd free buff cache si so bi bo in cs us sy id wa st
2 0 0 260386512 13592 662148 0 0 10 0 17 14 0 1 99 0 0
0 0 0 260519680 13592 604184 0 0 512 13 726843 12424 0 10 90 0 0
1 1 0 260435424 13592 598360 0 0 512 25 764645 12925 0 10 90 0 0
1 0 0 260855392 13592 578380 0 0 512 7 722943 13624 0 11 88 0 0
1 0 0 260445008 13592 601176 0 0 614 34 772288 14317 0 10 90 0 0

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
		
			
				
	
	
		
			213 lines
		
	
	
	
		
			5.3 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			213 lines
		
	
	
	
		
			5.3 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| // SPDX-License-Identifier: GPL-2.0
 | |
| /*
 | |
|  *     SUCS NET3:
 | |
|  *
 | |
|  *     Generic stream handling routines. These are generic for most
 | |
|  *     protocols. Even IP. Tonight 8-).
 | |
|  *     This is used because TCP, LLC (others too) layer all have mostly
 | |
|  *     identical sendmsg() and recvmsg() code.
 | |
|  *     So we (will) share it here.
 | |
|  *
 | |
|  *     Authors:        Arnaldo Carvalho de Melo <acme@conectiva.com.br>
 | |
|  *                     (from old tcp.c code)
 | |
|  *                     Alan Cox <alan@lxorguk.ukuu.org.uk> (Borrowed comments 8-))
 | |
|  */
 | |
| 
 | |
| #include <linux/module.h>
 | |
| #include <linux/sched/signal.h>
 | |
| #include <linux/net.h>
 | |
| #include <linux/signal.h>
 | |
| #include <linux/tcp.h>
 | |
| #include <linux/wait.h>
 | |
| #include <net/sock.h>
 | |
| 
 | |
| /**
 | |
|  * sk_stream_write_space - stream socket write_space callback.
 | |
|  * @sk: socket
 | |
|  *
 | |
|  * FIXME: write proper description
 | |
|  */
 | |
| void sk_stream_write_space(struct sock *sk)
 | |
| {
 | |
| 	struct socket *sock = sk->sk_socket;
 | |
| 	struct socket_wq *wq;
 | |
| 
 | |
| 	if (__sk_stream_is_writeable(sk, 1) && sock) {
 | |
| 		clear_bit(SOCK_NOSPACE, &sock->flags);
 | |
| 
 | |
| 		rcu_read_lock();
 | |
| 		wq = rcu_dereference(sk->sk_wq);
 | |
| 		if (skwq_has_sleeper(wq))
 | |
| 			wake_up_interruptible_poll(&wq->wait, EPOLLOUT |
 | |
| 						EPOLLWRNORM | EPOLLWRBAND);
 | |
| 		if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
 | |
| 			sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT);
 | |
| 		rcu_read_unlock();
 | |
| 	}
 | |
| }
 | |
| 
 | |
| /**
 | |
|  * sk_stream_wait_connect - Wait for a socket to get into the connected state
 | |
|  * @sk: sock to wait on
 | |
|  * @timeo_p: for how long to wait
 | |
|  *
 | |
|  * Must be called with the socket locked.
 | |
|  */
 | |
| int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
 | |
| {
 | |
| 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
 | |
| 	struct task_struct *tsk = current;
 | |
| 	int done;
 | |
| 
 | |
| 	do {
 | |
| 		int err = sock_error(sk);
 | |
| 		if (err)
 | |
| 			return err;
 | |
| 		if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV))
 | |
| 			return -EPIPE;
 | |
| 		if (!*timeo_p)
 | |
| 			return -EAGAIN;
 | |
| 		if (signal_pending(tsk))
 | |
| 			return sock_intr_errno(*timeo_p);
 | |
| 
 | |
| 		add_wait_queue(sk_sleep(sk), &wait);
 | |
| 		sk->sk_write_pending++;
 | |
| 		done = sk_wait_event(sk, timeo_p,
 | |
| 				     !sk->sk_err &&
 | |
| 				     !((1 << sk->sk_state) &
 | |
| 				       ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)), &wait);
 | |
| 		remove_wait_queue(sk_sleep(sk), &wait);
 | |
| 		sk->sk_write_pending--;
 | |
| 	} while (!done);
 | |
| 	return 0;
 | |
| }
 | |
| EXPORT_SYMBOL(sk_stream_wait_connect);
 | |
| 
 | |
| /**
 | |
|  * sk_stream_closing - Return 1 if we still have things to send in our buffers.
 | |
|  * @sk: socket to verify
 | |
|  */
 | |
| static inline int sk_stream_closing(struct sock *sk)
 | |
| {
 | |
| 	return (1 << sk->sk_state) &
 | |
| 	       (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK);
 | |
| }
 | |
| 
 | |
| void sk_stream_wait_close(struct sock *sk, long timeout)
 | |
| {
 | |
| 	if (timeout) {
 | |
| 		DEFINE_WAIT_FUNC(wait, woken_wake_function);
 | |
| 
 | |
| 		add_wait_queue(sk_sleep(sk), &wait);
 | |
| 
 | |
| 		do {
 | |
| 			if (sk_wait_event(sk, &timeout, !sk_stream_closing(sk), &wait))
 | |
| 				break;
 | |
| 		} while (!signal_pending(current) && timeout);
 | |
| 
 | |
| 		remove_wait_queue(sk_sleep(sk), &wait);
 | |
| 	}
 | |
| }
 | |
| EXPORT_SYMBOL(sk_stream_wait_close);
 | |
| 
 | |
| /**
 | |
|  * sk_stream_wait_memory - Wait for more memory for a socket
 | |
|  * @sk: socket to wait for memory
 | |
|  * @timeo_p: for how long
 | |
|  */
 | |
| int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
 | |
| {
 | |
| 	int err = 0;
 | |
| 	long vm_wait = 0;
 | |
| 	long current_timeo = *timeo_p;
 | |
| 	bool noblock = (*timeo_p ? false : true);
 | |
| 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
 | |
| 
 | |
| 	if (sk_stream_memory_free(sk))
 | |
| 		current_timeo = vm_wait = (prandom_u32() % (HZ / 5)) + 2;
 | |
| 
 | |
| 	add_wait_queue(sk_sleep(sk), &wait);
 | |
| 
 | |
| 	while (1) {
 | |
| 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
 | |
| 
 | |
| 		if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
 | |
| 			goto do_error;
 | |
| 		if (!*timeo_p) {
 | |
| 			if (noblock)
 | |
| 				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 | |
| 			goto do_nonblock;
 | |
| 		}
 | |
| 		if (signal_pending(current))
 | |
| 			goto do_interrupted;
 | |
| 		sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
 | |
| 		if (sk_stream_memory_free(sk) && !vm_wait)
 | |
| 			break;
 | |
| 
 | |
| 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 | |
| 		sk->sk_write_pending++;
 | |
| 		sk_wait_event(sk, ¤t_timeo, sk->sk_err ||
 | |
| 						  (sk->sk_shutdown & SEND_SHUTDOWN) ||
 | |
| 						  (sk_stream_memory_free(sk) &&
 | |
| 						  !vm_wait), &wait);
 | |
| 		sk->sk_write_pending--;
 | |
| 
 | |
| 		if (vm_wait) {
 | |
| 			vm_wait -= current_timeo;
 | |
| 			current_timeo = *timeo_p;
 | |
| 			if (current_timeo != MAX_SCHEDULE_TIMEOUT &&
 | |
| 			    (current_timeo -= vm_wait) < 0)
 | |
| 				current_timeo = 0;
 | |
| 			vm_wait = 0;
 | |
| 		}
 | |
| 		*timeo_p = current_timeo;
 | |
| 	}
 | |
| out:
 | |
| 	remove_wait_queue(sk_sleep(sk), &wait);
 | |
| 	return err;
 | |
| 
 | |
| do_error:
 | |
| 	err = -EPIPE;
 | |
| 	goto out;
 | |
| do_nonblock:
 | |
| 	err = -EAGAIN;
 | |
| 	goto out;
 | |
| do_interrupted:
 | |
| 	err = sock_intr_errno(*timeo_p);
 | |
| 	goto out;
 | |
| }
 | |
| EXPORT_SYMBOL(sk_stream_wait_memory);
 | |
| 
 | |
| int sk_stream_error(struct sock *sk, int flags, int err)
 | |
| {
 | |
| 	if (err == -EPIPE)
 | |
| 		err = sock_error(sk) ? : -EPIPE;
 | |
| 	if (err == -EPIPE && !(flags & MSG_NOSIGNAL))
 | |
| 		send_sig(SIGPIPE, current, 0);
 | |
| 	return err;
 | |
| }
 | |
| EXPORT_SYMBOL(sk_stream_error);
 | |
| 
 | |
| void sk_stream_kill_queues(struct sock *sk)
 | |
| {
 | |
| 	/* First the read buffer. */
 | |
| 	__skb_queue_purge(&sk->sk_receive_queue);
 | |
| 
 | |
| 	/* Next, the error queue. */
 | |
| 	__skb_queue_purge(&sk->sk_error_queue);
 | |
| 
 | |
| 	/* Next, the write queue. */
 | |
| 	WARN_ON(!skb_queue_empty(&sk->sk_write_queue));
 | |
| 
 | |
| 	/* Account for returned memory. */
 | |
| 	sk_mem_reclaim(sk);
 | |
| 
 | |
| 	WARN_ON(sk->sk_wmem_queued);
 | |
| 	WARN_ON(sk->sk_forward_alloc);
 | |
| 
 | |
| 	/* It is _impossible_ for the backlog to contain anything
 | |
| 	 * when we get here.  All user references to this socket
 | |
| 	 * have gone away, only the net layer knows can touch it.
 | |
| 	 */
 | |
| }
 | |
| EXPORT_SYMBOL(sk_stream_kill_queues);
 |