1. 注册Protocol Family
Linux内核在初始化阶段会执行inet_init函数,该函数注册了AF_INET/PF_INET这个Protocol Family以及该Family对应的协议栈(TCP、UDP、ICMP、RAW),相关代码在net/ipv4/af_inet.c。
全局数组net_families[]用于保存已注册的Protocol Family指针。
/* Table of registered protocol families: NPROTO slots, indexed by family number and read through rcu_dereference(). */
static const struct net_proto_family __rcu *net_families[NPROTO] __read_mostly;
inet_init中通过如下代码将inet_family_ops注册到net_families[]中。
(void)sock_register(&inet_family_ops);
inet_family_ops定义及PF_INET定义如下。
/*
 * Protocol-family descriptor for PF_INET. This is the object handed to
 * sock_register(); its .create hook (inet_create) is what pf->create
 * resolves to when a socket is requested with family AF_INET.
 */
static const struct net_proto_family inet_family_ops = {
.family = PF_INET,
.create = inet_create,
.owner = THIS_MODULE,
};
// /home/kangxiaoning/workspace/linux-3.10.0-1160/include/linux/socket.h
#define PF_INET AF_INET
#define AF_INET 2 /* Internet IP Protocol */
inet_init执行后, net_families[PF_INET]就指向了inet_family_ops结构体。
2. 创建Socket
2.1 Family匹配过程
在C语言中通过socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)创建一个socket。
调用socket syscall。
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
调用sock_create创建名为sock的struct socket对象。
retval = sock_create(family, type, protocol, &sock);
在__sock_create函数中通过net_families[family]获取Protocol Family ,根据传入的AF_INET得到inet_family_ops。
pf = rcu_dereference(net_families[family]);
调用pf->create创建socket ,即inet_family_ops中的inet_create函数。
err = pf->create(net, sock, protocol, kern);
至此,确定了family为AF_INET的socket应该调用inet_create函数创建。
2.2 Protocol匹配过程
全局数组inetsw维护socket type和protocol的关系,通过socket type(SOCK_STREAM, SOCK_DGRAM, SOCK_RAW )索引protocol(TCP, UDP, ICMP, RAW)。
/* The inetsw table contains everything that inet_create needs to
* build a new socket.
*
* It is an array of SOCK_MAX list heads; entries are added to it by
* inet_register_protosw().
*/
static struct list_head inetsw[SOCK_MAX];
静态数组inetsw_array[]包含默认protocol信息。
/*
 * Built-in (type, protocol) entries for PF_INET. Each entry pairs a socket
 * type and protocol number with a protocol-layer table (.prot) and a
 * socket-layer table (.ops). Every entry is passed to
 * inet_register_protosw() during inet_init.
 */
static struct inet_protosw inetsw_array[] =
{
/* SOCK_STREAM + IPPROTO_TCP -> tcp_prot / inet_stream_ops */
{
.type = SOCK_STREAM,
.protocol = IPPROTO_TCP,
.prot = &tcp_prot,
.ops = &inet_stream_ops,
.flags = INET_PROTOSW_PERMANENT |
INET_PROTOSW_ICSK,
},
/* SOCK_DGRAM + IPPROTO_UDP -> udp_prot / inet_dgram_ops */
{
.type = SOCK_DGRAM,
.protocol = IPPROTO_UDP,
.prot = &udp_prot,
.ops = &inet_dgram_ops,
.flags = INET_PROTOSW_PERMANENT,
},
/* SOCK_DGRAM + IPPROTO_ICMP -> ping_prot, sharing inet_dgram_ops with UDP */
{
.type = SOCK_DGRAM,
.protocol = IPPROTO_ICMP,
.prot = &ping_prot,
.ops = &inet_dgram_ops,
.flags = INET_PROTOSW_REUSE,
},
/* SOCK_RAW with the wildcard protocol IPPROTO_IP -> raw_prot / inet_sockraw_ops */
{
.type = SOCK_RAW,
.protocol = IPPROTO_IP, /* wild card */
.prot = &raw_prot,
.ops = &inet_sockraw_ops,
.flags = INET_PROTOSW_REUSE,
}
};
在inet_init中将inetsw_array[]中的信息注册到inetsw[]中。
/* Register the socket-side information for inet_create. */
/* First pass: initialise the list head in every inetsw[] slot. */
for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
INIT_LIST_HEAD(r);
/* Second pass: hand each built-in inetsw_array[] entry to inet_register_protosw(). */
for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
inet_register_protosw(q);
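在inet_create中利用注册的信息完成socket创建。以socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)为例,inet_create在inetsw[SOCK_STREAM]中匹配到的条目(即下文代码中的answer)如下。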
{
.type = SOCK_STREAM,
.protocol = IPPROTO_TCP,
.prot = &tcp_prot,
.ops = &inet_stream_ops,
.flags = INET_PROTOSW_PERMANENT |
INET_PROTOSW_ICSK,
},
// `sock->ops`赋值为`&inet_stream_ops`
sock->ops = answer->ops;
answer_prot = answer->prot;
answer_flags = answer->flags;
// `sk->sk_prot`赋值为`&tcp_prot`
sk = sk_alloc(net, pf_inet, gfp_kernel, answer_prot);
至此,socket创建完成,协议栈层级结构中不同的层都完成了初始化,socket层与protocol层的关系如下。
User Space Application
↓
socket(2) syscall
↓
┌────────────────────────────────┐
│ proto_ops (inet_stream_ops) │ ← BSD Socket Layer (Generic)
│ - bind(), connect() │
│ - send(), recv() │
└────────────────────────────────┘
↓
┌─────────────────────────────┐
│ proto (tcp_prot) │ ← Protocol-Specific Layer
│ - TCP state machine │
│ - Congestion control │
│ - Segment handling │
└─────────────────────────────┘
↓
IP Layer & Below
3. 操作集
如下列举了常见协议TCP/UDP的Socket操作集与Protocol操作集。
3.1 SOCK_STREAM操作集
inet_stream_ops定义了socket层中SOCK_STREAM类型的操作集。
/*
 * Socket-layer operation table for PF_INET stream sockets; referenced by
 * the SOCK_STREAM/IPPROTO_TCP entry of inetsw_array.
 *
 * Most hooks are generic inet_* or sock_common_* handlers. The TCP-specific
 * ones in this table are .poll (tcp_poll) and .splice_read (tcp_splice_read).
 * .socketpair and .mmap point at sock_no_* handlers (presumably stubs that
 * reject the operation; confirm against net/core/sock.c).
 */
const struct proto_ops inet_stream_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
.release = inet_release,
.bind = inet_bind,
.connect = inet_stream_connect,
.socketpair = sock_no_socketpair,
.accept = inet_accept,
.getname = inet_getname,
.poll = tcp_poll,
.ioctl = inet_ioctl,
.listen = inet_listen,
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
.recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
.sendpage = inet_sendpage,
.splice_read = tcp_splice_read,
/* 32-bit compatibility hooks, only built when CONFIG_COMPAT is set. */
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_sock_common_setsockopt,
.compat_getsockopt = compat_sock_common_getsockopt,
.compat_ioctl = inet_compat_ioctl,
#endif
};
3.2 TCP操作集
tcp_prot定义了TCP协议的操作集,是SOCK_STREAM类型的一种具体实现。
/*
 * Protocol-layer operation table for TCP; referenced as .prot by the
 * SOCK_STREAM/IPPROTO_TCP entry of inetsw_array and passed to sk_alloc()
 * through answer_prot in inet_create.
 */
struct proto tcp_prot = {
.name = "TCP",
.owner = THIS_MODULE,
/* Connection and socket lifecycle hooks. */
.close = tcp_close,
.connect = tcp_v4_connect,
.disconnect = tcp_disconnect,
.accept = inet_csk_accept,
.ioctl = tcp_ioctl,
.init = tcp_v4_init_sock,
.destroy = tcp_v4_destroy_sock,
.shutdown = tcp_shutdown,
.setsockopt = tcp_setsockopt,
.getsockopt = tcp_getsockopt,
/* Data-path hooks. */
.recvmsg = tcp_recvmsg,
.sendmsg = tcp_sendmsg,
.sendpage = tcp_sendpage,
.backlog_rcv = tcp_v4_do_rcv,
.release_cb = tcp_release_cb,
/* Hash-table and port hooks. */
.hash = inet_hash,
.unhash = inet_unhash,
.get_port = inet_csk_get_port,
/* Memory-pressure hooks and pointers to TCP-wide counters and sysctl arrays. */
.enter_memory_pressure = tcp_enter_memory_pressure,
.stream_memory_free = tcp_stream_memory_free,
.sockets_allocated = &tcp_sockets_allocated,
.orphan_count = &tcp_orphan_count,
.memory_allocated = &tcp_memory_allocated,
.memory_pressure = &tcp_memory_pressure,
.sysctl_wmem = sysctl_tcp_wmem,
.sysctl_rmem = sysctl_tcp_rmem,
.max_header = MAX_TCP_HEADER,
/* Object size is that of struct tcp_sock (presumably the size sk_alloc() allocates per socket; confirm). */
.obj_size = sizeof(struct tcp_sock),
.slab_flags = SLAB_DESTROY_BY_RCU,
.twsk_prot = &tcp_timewait_sock_ops,
.rsk_prot = &tcp_request_sock_ops,
.h.hashinfo = &tcp_hashinfo,
.no_autobind = true,
/* 32-bit compatibility hooks, only built when CONFIG_COMPAT is set. */
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_tcp_setsockopt,
.compat_getsockopt = compat_tcp_getsockopt,
#endif
/* cgroup hooks, only built when CONFIG_MEMCG_KMEM is set. */
#ifdef CONFIG_MEMCG_KMEM
.init_cgroup = tcp_init_cgroup,
.destroy_cgroup = tcp_destroy_cgroup,
.proto_cgroup = tcp_proto_cgroup,
#endif
};
3.3 SOCK_DGRAM操作集
inet_dgram_ops定义了socket层中SOCK_DGRAM类型的操作集。
/*
 * Socket-layer operation table for PF_INET datagram sockets; shared by the
 * SOCK_DGRAM entries of inetsw_array (IPPROTO_UDP and IPPROTO_ICMP).
 *
 * Compared with inet_stream_ops: .connect is inet_dgram_connect, .poll is
 * udp_poll, .accept and .listen point at sock_no_* handlers, and there is
 * no .splice_read entry.
 */
const struct proto_ops inet_dgram_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
.release = inet_release,
.bind = inet_bind,
.connect = inet_dgram_connect,
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = inet_getname,
.poll = udp_poll,
.ioctl = inet_ioctl,
.listen = sock_no_listen,
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
.recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
.sendpage = inet_sendpage,
/* 32-bit compatibility hooks, only built when CONFIG_COMPAT is set. */
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_sock_common_setsockopt,
.compat_getsockopt = compat_sock_common_getsockopt,
.compat_ioctl = inet_compat_ioctl,
#endif
};
3.4 UDP操作集
udp_prot定义了UDP协议的操作集,是SOCK_DGRAM类型的一种具体实现。
/*
 * Protocol-layer operation table for UDP; referenced as .prot by the
 * SOCK_DGRAM/IPPROTO_UDP entry of inetsw_array.
 */
struct proto udp_prot = {
.name = "UDP",
.owner = THIS_MODULE,
/* Socket lifecycle and option hooks. */
.close = udp_lib_close,
.connect = ip4_datagram_connect,
.disconnect = udp_disconnect,
.ioctl = udp_ioctl,
.init = udp_init_sock,
.destroy = udp_destroy_sock,
.setsockopt = udp_setsockopt,
.getsockopt = udp_getsockopt,
/* Data-path hooks. */
.sendmsg = udp_sendmsg,
.recvmsg = udp_recvmsg,
.sendpage = udp_sendpage,
.release_cb = ip4_datagram_release_cb,
/* Hash-table and port hooks. */
.hash = udp_lib_hash,
.unhash = udp_lib_unhash,
.rehash = udp_v4_rehash,
.get_port = udp_v4_get_port,
/*
 * Memory accounting. Note that .sysctl_wmem/.sysctl_rmem take the address
 * of single variables here, whereas tcp_prot assigns array names.
 */
.memory_allocated = &udp_memory_allocated,
.sysctl_mem = sysctl_udp_mem,
.sysctl_wmem = &sysctl_udp_wmem_min,
.sysctl_rmem = &sysctl_udp_rmem_min,
.obj_size = sizeof(struct udp_sock),
.slab_flags = SLAB_DESTROY_BY_RCU,
.h.udp_table = &udp_table,
/* 32-bit compatibility hooks, only built when CONFIG_COMPAT is set. */
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_udp_setsockopt,
.compat_getsockopt = compat_udp_getsockopt,
#endif
.clear_sk = sk_prot_clear_portaddr_nulls,
};