linux源码解读（二十）：网络通信简介——socket&sock结构体介绍

batsom · 2022-10-11 12:43:39

linux下的网络编程离不开socket，中文翻译为套接字。任何网络通信都必须先建立socket，再通过socket与对方收发数据！数据接收端的demo代码如下：

#include <string.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <netinet/in.h>
#include <stdio.h>
#include <unistd.h>
#define SET_PORT 3490

/*
 * Minimal TCP server skeleton: create a stream socket, bind it to
 * SET_PORT on every local interface, start listening and accept one
 * incoming connection.
 *
 * Returns 0 after one connection was accepted and closed, 1 if any
 * socket call failed (the failing call is reported through perror).
 */
int main(void)
{
    int sockfd, new_fd;
    struct sockaddr_in my_addr;
    struct sockaddr_in their_addr;
    socklen_t sin_size; /* accept() takes a socklen_t *, not an int * */

    sockfd = socket(PF_INET, SOCK_STREAM, 0);
    if (sockfd < 0) {
        perror("socket");
        return 1;
    }

    /* Zero the whole structure (covers sin_zero) before filling it in. */
    memset(&my_addr, 0, sizeof(my_addr));
    my_addr.sin_family = AF_INET;
    my_addr.sin_port = htons(SET_PORT);
    my_addr.sin_addr.s_addr = htonl(INADDR_ANY);

    /* Bind the socket to the local address. */
    if (bind(sockfd, (struct sockaddr *)&my_addr, sizeof(my_addr)) < 0) {
        perror("bind");
        close(sockfd);
        return 1;
    }

    /* Mark the socket as passive, with a backlog of 10 pending connections. */
    if (listen(sockfd, 10) < 0) {
        perror("listen");
        close(sockfd);
        return 1;
    }

    /* Accept one connection; sin_size is in/out and must start as the buffer size. */
    sin_size = sizeof(their_addr);
    new_fd = accept(sockfd, (struct sockaddr *)&their_addr, &sin_size);
    if (new_fd < 0) {
        perror("accept");
        close(sockfd);
        return 1;
    }

    close(new_fd);
    close(sockfd);
    return 0;
}

可以看出，需要先调用socket函数建立socket，再绑定套接字，最后监听并接受连接。这个socket到底是啥？linux在内核中又是怎么使用的呢？

1、（1）socket是个结构体，字段不多，但是嵌套了其他结构体，各种嵌套的关系标识如下：

proto_ops：用户层调用的各种接口就是在这里注册的（篇幅有限，截图的字段不全）
wq：等待该socket的进程队列和异步通知队列；换句话说：同一个socket可能有多个进程都在等待使用！
sock：应该是socket结构体最核心的嵌套结构体了（篇幅有限，截图的字段不全）！

FluxBB bbcode 测试

（2）socket结构体有了，接下来就是创建和初始化了！linux内核创建socket的函数是__sock_create，核心代码如下：

/*
 * Abridged excerpt of __sock_create(): only the two calls that actually
 * build the socket are shown; the "....." lines stand for code omitted
 * from this listing (including the lookup that assigns pf).
 */
int __sock_create(struct net *net, int family, int type, int protocol,
             struct socket **res, int kern)
{
    int err;
    struct socket *sock;
    const struct net_proto_family *pf;
        .........

     /*
     *    Allocate the socket and allow the family to set things up. if
     *    the protocol is 0, the family is instructed to select an appropriate
     *    default.
     *    Article note: in essence this creates the socket structure, which
     *    lives alongside an inode so that it can be looked up and managed
     *    through the superblock.
     */
    sock = sock_alloc();
        .........
        /* The protocol family's create hook runs here; for PF_INET the
          hook is inet_create, registered in af_inet.c as:
          static const struct net_proto_family inet_family_ops = {
               .family = PF_INET,
               .create = inet_create,
               .owner    = THIS_MODULE,
    };*/
    err = pf->create(net, sock, protocol, kern);
    ..................
}

创建socket的核心函数就2个:sock_alloc，还有pf->create！先看第一个sock_alloc，代码如下：

/**
 *    sock_alloc    -    allocate a socket
 *
 *    Takes a new inode from the superblock of sock_mnt and returns the
 *    socket object associated with that inode, after filling in the
 *    inode's number, mode, owner and operations. Returns NULL when no
 *    inode could be obtained.
 *
 *    Article commentary on why a socket allocation is backed by an inode:
 *    1. sockets need bookkeeping too, and hanging them off an inode lets
 *       the super_block index and manage them;
 *    2. the socket's attributes have a natural home in the inode;
 *    3. it fits the "everything is a file" philosophy.
 */

struct socket *sock_alloc(void)
{
    /* Take an inode from the superblock backing sock_mnt. */
    struct inode *node = new_inode_pseudo(sock_mnt->mnt_sb);
    struct socket *new_sock;

    if (node == NULL)
        return NULL;

    /* The socket is reached through its inode. */
    new_sock = SOCKET_I(node);

    /* NOTE(review): presumably tells kmemcheck the bitfield holding
     * 'type' is initialised -- confirm against the kmemcheck docs. */
    kmemcheck_annotate_bitfield(new_sock, type);

    node->i_ino = get_next_ino();
    node->i_mode = S_IFSOCK | S_IRWXUGO;
    node->i_uid = current_fsuid();
    node->i_gid = current_fsgid();
    node->i_op = &sockfs_inode_ops;

    /* Bump this CPU's sockets_in_use counter. */
    this_cpu_add(sockets_in_use, 1);
    return new_sock;
}

本质上就是分配一个inode，然后和socket结构体绑定，通过inode寻址socket结构体！socket结构体有了，接下来就是在socket内部嵌套的sock结构体了！其生成和初始化的工作都是在inet_create内部完成的，代码如下：

/*
 * Create and initialise the struct sock that backs a PF_INET socket.
 *
 * Searches inetsw[sock->type] for an entry matching the requested
 * protocol (retrying after up to two request_module() calls), refuses
 * SOCK_RAW to non-kernel callers lacking CAP_NET_RAW, allocates the sock
 * with sk_alloc(), fills in the inet-specific defaults, links sock and sk
 * through sock_init_data(), then hashes the sock (when inet_num is set)
 * and runs the protocol's init hook.
 *
 * Returns 0 on success or a negative errno value.
 */
static int inet_create(struct net *net, struct socket *sock, int protocol,
               int kern)
{
    struct sock *sk;
    struct inet_protosw *answer;
    struct inet_sock *inet;
    struct proto *answer_prot;
    unsigned char answer_flags;
    int try_loading_module = 0;
    int err;

    if (protocol < 0 || protocol >= IPPROTO_MAX)
        return -EINVAL;

    sock->state = SS_UNCONNECTED;// a newly created socket starts out unconnected

    /* Look for the requested type/protocol pair. */
lookup_protocol:
    err = -ESOCKTNOSUPPORT;
    rcu_read_lock();
    list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {

        err = 0;
        /* Check the non-wild match. */
        if (protocol == answer->protocol) {
            if (protocol != IPPROTO_IP)
                break;
        } else {
            /* Check for the two wild cases. */
            if (IPPROTO_IP == protocol) {
                protocol = answer->protocol;
                break;
            }
            if (IPPROTO_IP == answer->protocol)
                break;
        }
        err = -EPROTONOSUPPORT;
    }

    if (unlikely(err)) {
        if (try_loading_module < 2) {
            rcu_read_unlock();
            /*
             * Be more specific, e.g. net-pf-2-proto-132-type-1
             * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
             */
            if (++try_loading_module == 1)
                request_module("net-pf-%d-proto-%d-type-%d",
                           PF_INET, protocol, sock->type);
            /*
             * Fall back to generic, e.g. net-pf-2-proto-132
             * (net-pf-PF_INET-proto-IPPROTO_SCTP)
             */
            else
                request_module("net-pf-%d-proto-%d",
                           PF_INET, protocol);
            goto lookup_protocol;
        } else
            goto out_rcu_unlock;
    }

    err = -EPERM;
    if (sock->type == SOCK_RAW && !kern &&
        !ns_capable(net->user_ns, CAP_NET_RAW))
        goto out_rcu_unlock;

    /* Copy what is needed out of the matched entry before leaving the
     * RCU read-side section. */
    sock->ops = answer->ops;
    answer_prot = answer->prot;
    answer_flags = answer->flags;
    rcu_read_unlock();

    WARN_ON(!answer_prot->slab);

    err = -ENOBUFS;
    /* Allocate the sock instance for this protocol. NOTE(review): the
     * article says it comes from a per-CPU cache or the heap and is
     * initialised by sk_alloc -- confirm against sk_alloc(). */
    sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern);
    if (!sk)
        goto out;

    err = 0;
    if (INET_PROTOSW_REUSE & answer_flags)
        sk->sk_reuse = SK_CAN_REUSE;
    /*
    1. View sk as an inet_sock so the inet-specific fields can be set.
    2. NOTE(review): presumably inet_sk() is a plain cast, so inet and sk
       refer to the same object and both may be used -- confirm.
    */
    inet = inet_sk(sk);
    inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;

    inet->nodefrag = 0;

    if (SOCK_RAW == sock->type) {
        inet->inet_num = protocol;
        if (IPPROTO_RAW == protocol)
            inet->hdrincl = 1;
    }

    if (net->ipv4.sysctl_ip_no_pmtu_disc)
        inet->pmtudisc = IP_PMTUDISC_DONT;
    else
        inet->pmtudisc = IP_PMTUDISC_WANT;

    inet->inet_id = 0;
    /*
    sock_init_data():
    1. initialises the sk_buff receive, write and error queues
    2. links the socket and sock instances to each other
    3. installs the default sock callbacks
    4. initialises the remaining sock fields
    */
    sock_init_data(sock, sk);

    sk->sk_destruct       = inet_sock_destruct;// callback run on destruction
    sk->sk_protocol       = protocol;// protocol number
    sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
    // initialisation alternates between the sk and inet views
    inet->uc_ttl    = -1;
    inet->mc_loop    = 1;
    inet->mc_ttl    = 1;
    inet->mc_all    = 1;
    inet->mc_index    = 0;
    inet->mc_list    = NULL;
    inet->rcv_tos    = 0;

    sk_refcnt_debug_inc(sk);// NOTE(review): article calls this "refcount +1"; looks like a debug accounting hook -- confirm

    if (inet->inet_num) {
        /* It assumes that any protocol which allows
         * the user to assign a number at socket
         * creation time automatically
         * shares.
         */
        inet->inet_sport = htons(inet->inet_num);
        /* Add to protocol hash chains. */
        err = sk->sk_prot->hash(sk);
        if (err) {
            sk_common_release(sk);
            goto out;
        }
    }

    if (sk->sk_prot->init) {
        err = sk->sk_prot->init(sk);
        if (err)
            sk_common_release(sk);
    }
out:
    return err;
out_rcu_unlock:
    /* Error paths taken while still inside the RCU read-side section. */
    rcu_read_unlock();
    goto out;
}

整个逻辑并不复杂，先是调用sk_alloc函数生成sock实例，再调用sock_init_data初始化sock实例，并和socket实例关联，所以我个人认为sock_init_data是最核心的函数，如下:

/*
 * Initialise a freshly allocated sock and attach it to its socket:
 * 1. initialise the sk_buff receive, write and error queues
 * 2. link the socket and sock instances to each other
 * 3. install the default sock callbacks
 * 4. initialise the remaining sock fields
 *
 * sock may be NULL, in which case sk gets no wait queue (sk_wq = NULL)
 * and sk_type is left untouched.
 */
void sock_init_data(struct socket *sock, struct sock *sk)
{
    /* Initialise the sk_buff receive, write and error queues. */
    skb_queue_head_init(&sk->sk_receive_queue);
    skb_queue_head_init(&sk->sk_write_queue);
    skb_queue_head_init(&sk->sk_error_queue);

    sk->sk_send_head    =    NULL;
    // initialise the sock timer
    init_timer(&sk->sk_timer);

    sk->sk_allocation    =    GFP_KERNEL;
    sk->sk_rcvbuf        =    sysctl_rmem_default;
    sk->sk_sndbuf        =    sysctl_wmem_default;
    sk->sk_state        =    TCP_CLOSE;
    // sk -> socket direction of the link; the reverse pointer is set below
    sk_set_socket(sk, sock);

    sock_set_flag(sk, SOCK_ZAPPED);

    if (sock) {
        sk->sk_type    =    sock->type;
        sk->sk_wq    =    sock->wq;
        sock->sk    =    sk;
    } else
        sk->sk_wq    =    NULL;

    rwlock_init(&sk->sk_callback_lock);
    lockdep_set_class_and_name(&sk->sk_callback_lock,
            af_callback_keys + sk->sk_family,
            af_family_clock_key_strings[sk->sk_family]);

    sk->sk_state_change    =    sock_def_wakeup;// callback for a state change
    sk->sk_data_ready    =    sock_def_readable;// callback when data is available to read
    sk->sk_write_space    =    sock_def_write_space;// callback when buffer space is available for writing
    sk->sk_error_report    =    sock_def_error_report;// callback when an I/O error occurs
    sk->sk_destruct        =    sock_def_destruct;

    sk->sk_frag.page    =    NULL;
    sk->sk_frag.offset    =    0;
    sk->sk_peek_off        =    -1;

    sk->sk_peer_pid     =    NULL;
    sk->sk_peer_cred    =    NULL;
    sk->sk_write_pending    =    0;
    sk->sk_rcvlowat        =    1;
    sk->sk_rcvtimeo        =    MAX_SCHEDULE_TIMEOUT;
    sk->sk_sndtimeo        =    MAX_SCHEDULE_TIMEOUT;

    sk->sk_stamp = ktime_set(-1L, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
    sk->sk_napi_id        =    0;
    sk->sk_ll_usec        =    sysctl_net_busy_read;
#endif

    sk->sk_max_pacing_rate = ~0U;
    sk->sk_pacing_rate = ~0U;
    sk->sk_incoming_cpu = -1;
    /*
     * Before updating sk_refcnt, we must commit prior changes to memory
     * (Documentation/RCU/rculist_nulls.txt for details)
     */
    smp_wmb();
    atomic_set(&sk->sk_refcnt, 1);
    atomic_set(&sk->sk_drops, 0);
}

上面有几个回调函数，它们的实现逻辑和代码结构基本是一样的：

/*
 *    Default Socket Callbacks
 *
 *    sock_def_wakeup: invoked when the state of the sock changes. Wakes
 *    every task sleeping on the sock's wait queue, if there is one.
 */

static void sock_def_wakeup(struct sock *sk)
{
    struct socket_wq *waitq;

    rcu_read_lock();
    waitq = rcu_dereference(sk->sk_wq);
    if (skwq_has_sleeper(waitq)) {
        /* Some task is blocked on this socket: wake all waiters. */
        wake_up_interruptible_all(&waitq->wait);
    }
    rcu_read_unlock();
}
/*
 * sock_def_readable: invoked when the sock has incoming data to read.
 * Wakes sleepers on the wait queue with the readable poll bits, then
 * passes the event to the asynchronous notification path.
 */
static void sock_def_readable(struct sock *sk)
{
    struct socket_wq *waitq;

    rcu_read_lock();
    waitq = rcu_dereference(sk->sk_wq);
    if (skwq_has_sleeper(waitq)) {
        /* Wake the tasks waiting for data. */
        wake_up_interruptible_sync_poll(&waitq->wait, POLLIN | POLLPRI |
                        POLLRDNORM | POLLRDBAND);
    }
    /*
     * Asynchronous notification. NOTE(review): per the article,
     * sk_wake_async() checks whether the application is already waiting
     * in a recv()-style call and, if not, sends SIGIO to announce
     * readable data; its second argument selects how to handle the
     * event and the third is the I/O band reported -- confirm against
     * sk_wake_async().
     */
    sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
    rcu_read_unlock();
}

当有可读数据的时候，肯定第一时间通知相应的进程来读取数据，核心是通过sk_wake_async函数实现的；而sk_wake_async最终调用了kill_fasync_rcu来给排队等待的队列发出SIGIO信号，通知这些队列中的进程来取数据了！异步的好处在这里就凸显了：进程不用在这里空转等数据，而是可以释放cpu去执行其他进程的代码；等socket有数据后再通过类似中断的形式通知等待的进程来取数据了！

/*
 * rcu_read_lock() is held
   函数名有kill，但实际是向队列的进程发送SIGIO信号
 */
static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
{
    while (fa) {
        struct fown_struct *fown;
        unsigned long flags;

        if (fa->magic != FASYNC_MAGIC) {
            printk(KERN_ERR "kill_fasync: bad magic number in "
                   "fasync_struct!\n");
            return;
        }
        spin_lock_irqsave(&fa->fa_lock, flags);
        if (fa->fa_file) {
            fown = &fa->fa_file->f_owner;
            /* Don't send SIGURG to processes which have not set a
               queued signum: SIGURG has its own default signalling
               mechanism. */
            if (!(sig == SIGURG && fown->signum == 0))
                send_sigio(fown, fa->fa_fd, band);
        }
        spin_unlock_irqrestore(&fa->fa_lock, flags);
        fa = rcu_dereference(fa->fa_next);
    }
}

Gentoo中文社区

公告

#1 2022-10-11 12:43:39

linux源码解读（二十）：网络通信简介——socket&sock结构体介绍

页脚