Skip to content

相比于fnat ipv4,fnat64 pps性能存在明显下降 #1063

@liuxuting

Description

@liuxuting

问题描述:在小包场景下,相比于v4 fnat,fnat64存在明显的性能下降约30%。基于dperf压测工具和物理机mlx25G网卡进行性能测试,在cps900k,cc10,pkg size 84情况下,测出fnat v4 pps约为620w,fnat64 pps约为470w。
问题分析:结合火焰图进行分析,fnat v4出向dp_vs_out_xmit_fnat函数占比仅0.23%,而fnat64出向过程dp_vs_out_xmit_fnat函数占比1.29%,如下图所示。

Image Image

从代码分析看,fnat64出向过程中,通过查路由表及走协议栈进行转发,而fnat v4大部分通过fast path卸载到网卡,将数据包直接转发到网关下一跳。代码如下:

/*
 * Outbound FNAT transmit for IPv4 (excerpt: everything after the fast-path
 * attempt is elided as "..." below).
 *
 * Unless fast xmit is closed globally (fast_xmit_close) or disabled for this
 * connection (DPVS_CONN_F_NOFASTXMIT), the fast path is tried first and the
 * function returns EDPVS_OK as soon as dp_vs_fast_outxmit_fnat() returns 0.
 */
static int __dp_vs_out_xmit_fnat4(struct dp_vs_proto *proto,
                                  struct dp_vs_conn *conn,
                                  struct rte_mbuf *mbuf)
{
    struct flow4 fl4;
    struct rte_ipv4_hdr *iph = ip4_hdr(mbuf);
    struct route_entry *rt;
    int err, mtu;

    /* fast path first; a non-zero result falls through to the code below */
    if (!fast_xmit_close && !(conn->flags & DPVS_CONN_F_NOFASTXMIT)) {
        dp_vs_save_outxmit_info(mbuf, proto, conn);
        if (!dp_vs_fast_outxmit_fnat(AF_INET, proto, conn, mbuf)) {
            return EDPVS_OK;
        }
    }

    ...
}
/*
 * Outbound FNAT transmit for the v4 -> v6 case (slow path).
 *
 * Looks up an IPv6 route towards conn->caddr, checks the MTU, optionally
 * decrements the IPv4 TTL, runs the protocol pre-handler, translates the
 * packet with mbuf_4to6(), runs the protocol FNAT handler and finally hands
 * the mbuf to the IPv6 LOCAL_OUT hook with ip6_output.
 *
 * Returns the INET_HOOK() result on success. On any failure the route (if
 * obtained) is released, the mbuf is freed and an EDPVS_* error is returned.
 */
static int __dp_vs_out_xmit_fnat46(struct dp_vs_proto *proto,
                                   struct dp_vs_conn *conn,
                                   struct rte_mbuf *mbuf)
{
    struct flow6 fl6;
    struct rte_ipv4_hdr *ip4h = ip4_hdr(mbuf);
    uint32_t pkt_len;
    struct route6 *rt6;
    int err, mtu;

    /*
     * drop old route. just for safe, because
     * FNAT is PRE_ROUTING, should not have route.
     */
    if (unlikely(MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE) != NULL)) {
        RTE_LOG(WARNING, IPVS, "%s: FNAT have route %p ?\n", __func__,
                MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE));
        route4_put(MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE));
    }

    /* IPv6 route lookup: destination is the client, source is the VIP */
    memset(&fl6, 0, sizeof(struct flow6));
    fl6.fl6_daddr = conn->caddr.in6;
    fl6.fl6_saddr = conn->vaddr.in6;
    rt6 = route6_output(mbuf, &fl6);
    if (!rt6) {
        err = EDPVS_NOROUTE;
        goto errout;
    }

    /*
     * didn't cache the pointer to rt
     * or route can't be deleted when there is conn ref
     * this is for neighbour confirm
     */
    // compute the next hop
    dp_vs_conn_cache_rt6(conn, rt6, false);

    /*
     * NOTE(review): the original comment here read "mbuf is from IPv6,
     * icmp should send by icmp6" (and was cut off), yet the code below
     * reports the error with icmp_send() -- confirm which is intended.
     */
    // compute the MTU and compare it with the length from mbuf_nat4to6_len()
    mtu = rt6->rt6_mtu;
    pkt_len = mbuf_nat4to6_len(mbuf);
    if (pkt_len > mtu
           && (ip4h->fragment_offset & htons(RTE_IPV4_HDR_DF_FLAG))) {
        RTE_LOG(DEBUG, IPVS, "%s: frag needed.\n", __func__);
        icmp_send(mbuf, ICMP_DEST_UNREACH, ICMP_UNREACH_NEEDFRAG, htonl(mtu));
        err = EDPVS_FRAG;
        goto errout;
    }

    /* attach the route to the mbuf for the later output stage */
    MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE) = rt6;
    /* after route lookup and before translation */
    if (xmit_ttl) {
        if (unlikely(ip4h->time_to_live <= 1)) {
            icmp_send(mbuf, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
            err = EDPVS_DROP;
            goto errout;
        }
        ip4h->time_to_live--;
    }

    /* pre-handler before translation */
    if (proto->fnat_out_pre_handler) {
        err = proto->fnat_out_pre_handler(proto, conn, mbuf);
        if (err != EDPVS_OK)
            goto errout;
    }

    /* L3 translation before l4 re-csum */
    err = mbuf_4to6(mbuf, &conn->vaddr.in6, &conn->caddr.in6);
    if (err)
        goto errout;

    /* L4 FNAT translation */
    if (proto->fnat_out_handler) {
        err = proto->fnat_out_handler(proto, conn, mbuf);
        if (err != EDPVS_OK)
            goto errout;
    }

    return INET_HOOK(AF_INET6, INET_HOOK_LOCAL_OUT, mbuf,
                     NULL, rt6->rt6_dev, ip6_output);

errout:
    /* every jump here happens after rt6 was assigned by route6_output() */
    if (rt6)
        route6_put(rt6);
    rte_pktmbuf_free(mbuf);
    return err;
}

解决方案:
针对上述问题,将fnat64出向过程简化:首先将ipv4头替换为ipv6头,然后直接由网卡转发到下一跳(即网关)。实现代码如下:

static int __dp_vs_out_xmit_fnat46(struct dp_vs_proto *proto,
                                   struct dp_vs_conn *conn,
                                   struct rte_mbuf *mbuf)
{
    struct flow6 fl6;
    struct rte_ipv4_hdr *ip4h = ip4_hdr(mbuf);
    uint32_t pkt_len;
    struct route6 *rt6;
    int err, mtu;
    if (!fast_xmit_close && !(conn->flags & DPVS_CONN_F_NOFASTXMIT)) {
        dp_vs_save_outxmit_info(mbuf, proto, conn);
        err = __dp_vs_fast_outxmit_fnat4to6(proto, conn, mbuf);
        if (!err) {
            return EDPVS_OK;
        } else {
            RTE_LOG(ERR, IPVS, "err: %d.\n", err);
        }
    }
  ...
  }
/*
 * Fast-path outbound FNAT transmit for the v4 -> v6 case.
 *
 * Translates the packet to IPv6, prepends an Ethernet header built from the
 * MAC addresses recorded on the connection (conn->out_dmac / conn->out_smac)
 * and transmits it on conn->out_dev, bypassing the route lookup.
 *
 * Ownership contract with the caller:
 *   - EDPVS_OK:  the mbuf has been consumed (transmitted or dropped).
 *   - otherwise: the mbuf has NOT been translated to IPv6 by this function
 *                and is still owned by the caller, which may fall back to
 *                the slow path.
 *
 * Returns EDPVS_NOROUTE when no output device is recorded, EDPVS_NOTSUPP
 * when either MAC address is all-zero, the pre-handler's or mbuf_4to6()'s
 * error code when those fail, and EDPVS_OK otherwise.
 */
static int __dp_vs_fast_outxmit_fnat4to6(struct dp_vs_proto *proto,
                                      struct dp_vs_conn *conn,
                                      struct rte_mbuf *mbuf)
{
    struct rte_ether_hdr *eth;
    uint16_t packet_type = RTE_ETHER_TYPE_IPV6;
    int err;

    if (unlikely(conn->out_dev == NULL))
        return EDPVS_NOROUTE;

    if (unlikely(rte_is_zero_ether_addr(&conn->out_dmac) ||
                 rte_is_zero_ether_addr(&conn->out_smac)))
        return EDPVS_NOTSUPP;

    /* pre-handler before translation */
    if (proto->fnat_out_pre_handler) {
        err = proto->fnat_out_pre_handler(proto, conn, mbuf);
        if (err != EDPVS_OK)
            return err;
        /* no IP header pointer is held here, so nothing to re-fetch */
    }

    /* L3 translation before l4 re-csum */
    err = mbuf_4to6(mbuf, &conn->vaddr.in6, &conn->caddr.in6);
    if (err)
        return err; /* assumes mbuf_4to6() leaves mbuf usable on failure -- TODO confirm */

    /*
     * From here on the mbuf carries an IPv6 header. Returning an error
     * would make the caller run the slow path, which translates 4-to-6
     * again, on an already translated packet. Any later failure therefore
     * drops the packet here and reports it as consumed.
     */

    /* L4 FNAT translation */
    if (proto->fnat_out_handler) {
        err = proto->fnat_out_handler(proto, conn, mbuf);
        if (err != EDPVS_OK) {
            RTE_LOG(DEBUG, IPVS, "%s: fnat_out_handler failed: %d, drop.\n",
                    __func__, err);
            goto drop;
        }
    }

    /* rte_pktmbuf_prepend() returns NULL when headroom is insufficient */
    eth = (struct rte_ether_hdr *)rte_pktmbuf_prepend(mbuf,
                    (uint16_t)sizeof(struct rte_ether_hdr));
    if (unlikely(eth == NULL)) {
        RTE_LOG(DEBUG, IPVS, "%s: no headroom for ether header, drop.\n",
                __func__);
        goto drop;
    }

    rte_ether_addr_copy(&conn->out_dmac, &eth->d_addr);
    rte_ether_addr_copy(&conn->out_smac, &eth->s_addr);
    eth->ether_type = rte_cpu_to_be_16(packet_type);
    mbuf->packet_type = packet_type;

    err = netif_xmit(mbuf, conn->out_dev);
    if (err != EDPVS_OK)
        RTE_LOG(DEBUG, IPVS, "%s: fail to netif_xmit.\n", __func__);

    /* must return OK since netif_xmit always consumes mbuf */
    return EDPVS_OK;

drop:
    rte_pktmbuf_free(mbuf);
    return EDPVS_OK;
}

在cps900k,cc10,pkg size 84情况下,测出fnat64 pps 520w。相比于fnat v4,优化后的fnat64转发性能下降了16%;相比于优化前fnat64,优化后的fnat64转发性能提升11%,火焰图如下:

Image

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions