/*
 * Copyright (c) 2017-2023 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*	$FreeBSD: src/sys/netinet6/frag6.c,v 1.2.2.5 2001/07/03 11:01:50 ume Exp $	*/
/*	$KAME: frag6.c,v 1.31 2001/05/17 13:45:34 jinmei Exp $	*/

/*
 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the project nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * @file
 * flowswitch IP Reassembly for both v4 and v6
 *
 * Implementation of IP packet fragmentation and reassembly.
 */

#include <sys/domain.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <skywalk/os_skywalk_private.h>
#include <skywalk/nexus/flowswitch/nx_flowswitch.h>
#include <skywalk/nexus/flowswitch/fsw_var.h>

#define IPFM_MAX_FRAGS_PER_QUEUE	128	/* RFC 791 64K/(512 min MTU) */
#define IPFM_MAX_QUEUES			1024	/* same as ip/ip6 */
#define IPFM_FRAG_TTL			60	/* RFC 2460 */
#define IPFM_TIMEOUT_TCALL_INTERVAL	1

static uint32_t ipfm_max_frags_per_queue = IPFM_MAX_FRAGS_PER_QUEUE;
static uint32_t ipfm_frag_ttl = IPFM_FRAG_TTL;
static uint32_t ipfm_timeout_tcall_ival = IPFM_TIMEOUT_TCALL_INTERVAL;

#if (DEVELOPMENT || DEBUG)
SYSCTL_INT(_kern_skywalk_flowswitch, OID_AUTO, ipfm_max_frags_per_queue,
    CTLFLAG_RW | CTLFLAG_LOCKED, &ipfm_max_frags_per_queue, 0, "");
#endif /* (DEVELOPMENT || DEBUG) */
SYSCTL_INT(_kern_skywalk_flowswitch, OID_AUTO, ipfm_frag_ttl,
    CTLFLAG_RW | CTLFLAG_LOCKED, &ipfm_frag_ttl, 0, "");
SYSCTL_INT(_kern_skywalk_flowswitch, OID_AUTO, ipfm_timeout_tcall_ival,
    CTLFLAG_RW | CTLFLAG_LOCKED, &ipfm_timeout_tcall_ival, 0, "");

static LCK_GRP_DECLARE(fsw_ipfm_lock_group, "sk_fsw_ipfm_lock");
static LCK_ATTR_DECLARE(fsw_ipfm_lock_attr, 0, 0);

/* @internal ip fragment wrapper (chained in an ipfq) for __kern_packet */
struct ipf {
	struct ipf *ipf_down;
	struct ipf *ipf_up;
	struct __kern_packet *ipf_pkt;
	int ipf_len;			/* fragmentable part length */
	int ipf_off;			/* fragment offset */
	uint16_t ipf_mff;		/* more fragment bit in frag off */
};

/* @internal ip fragment lookup key */
struct ipf_key {
	uint64_t ipfk_addr[4];		/* src + dst ip addr (v4/v6) */
	uint32_t ipfk_ident;		/* IP identification */
	uint16_t ipfk_len;		/* len of ipfk_addr field */
};

enum {
	IPFK_LEN_V4 = 2 * sizeof(struct in_addr),
	IPFK_LEN_V6 = 2 * sizeof(struct in6_addr),
};
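
/*
 * Illustrative sketch (not compiled; the helper name is hypothetical): how
 * an IPv4 lookup key would be populated.  Source and destination addresses
 * are contiguous in the IP header, so a single bcopy() fills ipfk_addr;
 * ipfk_len records how many address bytes are significant so that
 * ipf_key_cmp() compares only the valid prefix.
 */
#if 0
static void
ipf_key_fill_v4(struct ipf_key *key, const struct ip *ip4)
{
	bcopy(&ip4->ip_src, key->ipfk_addr, IPFK_LEN_V4);
	key->ipfk_len = IPFK_LEN_V4;
	key->ipfk_ident = (uint32_t)ip4->ip_id;
}
#endif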
/*
 * @internal
 * IP reassembly queue structure.  Each fragment (struct ipf)
 * being reassembled is attached to one of these structures.
 */
struct ipfq {
	struct ipf *ipfq_down;		/* fragment chain */
	struct ipf *ipfq_up;
	struct ipfq *ipfq_next;		/* queue chain */
	struct ipfq *ipfq_prev;
	uint64_t ipfq_timestamp;	/* time of creation */
	struct ipf_key ipfq_key;	/* ipfq search key */
	uint16_t ipfq_nfrag;		/* # of fragments in queue */
	int ipfq_unfraglen;		/* len of unfragmentable part */
	bool ipfq_is_dirty;		/* q is dirty, don't use */
};

/*
 * @internal (externally opaque)
 * flowswitch IP Fragment Manager
 */
struct fsw_ip_frag_mgr {
	struct skoid ipfm_skoid;
	struct ipfq ipfm_q;		/* ip reassembly queues */
	uint32_t ipfm_q_limit;		/* limit # of reass queues */
	uint32_t ipfm_q_count;		/* # of allocated reass queues */
	uint32_t ipfm_f_limit;		/* limit # of ipfs */
	uint32_t ipfm_f_count;		/* current # of allocated ipfs */
	decl_lck_mtx_data(, ipfm_lock);	/* guard reass and timeout cleanup */
	thread_call_t ipfm_timeout_tcall; /* frag timeout thread */
	struct ifnet *ipfm_ifp;
	struct fsw_stats *ipfm_stats;	/* indirect stats in fsw */
};

static int ipf_process(struct fsw_ip_frag_mgr *, struct __kern_packet **,
    struct ipf_key *, uint16_t, uint16_t, uint16_t, uint16_t, uint16_t *,
    uint16_t *);
static int ipf_key_cmp(struct ipf_key *, struct ipf_key *);
static void ipf_enq(struct ipf *, struct ipf *);
static void ipf_deq(struct ipf *);
static void ipfq_insque(struct ipfq *, struct ipfq *);
static void ipfq_remque(struct ipfq *);
static uint32_t ipfq_freef(struct fsw_ip_frag_mgr *mgr, struct ipfq *,
    void (*)(struct fsw_ip_frag_mgr *, struct ipf *));
static void ipfq_timeout(thread_call_param_t, thread_call_param_t);
static void ipfq_sched_timeout(struct fsw_ip_frag_mgr *, boolean_t);
static struct ipfq *ipfq_alloc(struct fsw_ip_frag_mgr *mgr);
static void ipfq_free(struct fsw_ip_frag_mgr *mgr, struct ipfq *q);
static uint32_t ipfq_freefq(struct fsw_ip_frag_mgr *mgr, struct ipfq *q,
    void (*ipf_cb)(struct fsw_ip_frag_mgr *, struct ipf *));
static struct ipf *ipf_alloc(struct fsw_ip_frag_mgr *mgr);
static void ipf_free(struct fsw_ip_frag_mgr *mgr, struct ipf *f);
static void ipf_free_pkt(struct ipf *f);
static void ipfq_drain(struct fsw_ip_frag_mgr *mgr);
static void ipfq_reap(struct fsw_ip_frag_mgr *mgr);
static int ipfq_drain_sysctl SYSCTL_HANDLER_ARGS;

void ipf_icmp_param_err(struct fsw_ip_frag_mgr *, struct __kern_packet *pkt,
    int param);
void ipf_icmp_timeout_err(struct fsw_ip_frag_mgr *, struct ipf *f);
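
/*
 * Illustrative sketch (not compiled): both chains are circular and use the
 * list head itself as the sentinel.  An empty manager has
 * ipfm_q.ipfq_next == ipfm_q.ipfq_prev == &ipfm_q, and an empty queue
 * points its fragment chain at itself cast to (struct ipf *).  A lookup
 * therefore walks until it returns to the head:
 */
#if 0
	struct ipfq *q;
	for (q = mgr->ipfm_q.ipfq_next; q != &mgr->ipfm_q; q = q->ipfq_next) {
		/* visit reassembly queues, most recently created first */
	}
#endif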
/* Create a flowswitch IP fragment manager. */
struct fsw_ip_frag_mgr *
fsw_ip_frag_mgr_create(struct nx_flowswitch *fsw, struct ifnet *ifp,
    size_t f_limit)
{
	struct fsw_ip_frag_mgr *mgr;

	ASSERT(ifp != NULL);

	mgr = sk_alloc_type(struct fsw_ip_frag_mgr, Z_WAITOK | Z_NOFAIL,
	    skmem_tag_fsw_frag_mgr);
	mgr->ipfm_q.ipfq_next = mgr->ipfm_q.ipfq_prev = &mgr->ipfm_q;
	lck_mtx_init(&mgr->ipfm_lock, &fsw_ipfm_lock_group,
	    &fsw_ipfm_lock_attr);

	mgr->ipfm_timeout_tcall =
	    thread_call_allocate_with_options(ipfq_timeout, mgr,
	    THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
	VERIFY(mgr->ipfm_timeout_tcall != NULL);

	mgr->ipfm_ifp = ifp;
	mgr->ipfm_stats = &fsw->fsw_stats;

	/* Use caller provided limit (caller knows pool size) */
	ASSERT(f_limit >= 2 && f_limit < UINT32_MAX);
	mgr->ipfm_f_limit = (uint32_t)f_limit;
	mgr->ipfm_f_count = 0;
	mgr->ipfm_q_limit = MIN(IPFM_MAX_QUEUES, mgr->ipfm_f_limit / 2);
	mgr->ipfm_q_count = 0;

	skoid_create(&mgr->ipfm_skoid, SKOID_DNODE(fsw->fsw_skoid),
	    "ipfm", 0);
	skoid_add_uint(&mgr->ipfm_skoid, "frag_limit", CTLFLAG_RW,
	    &mgr->ipfm_f_limit);
	skoid_add_uint(&mgr->ipfm_skoid, "frag_count", CTLFLAG_RD,
	    &mgr->ipfm_f_count);
	skoid_add_uint(&mgr->ipfm_skoid, "queue_limit", CTLFLAG_RW,
	    &mgr->ipfm_q_limit);
	skoid_add_uint(&mgr->ipfm_skoid, "queue_count", CTLFLAG_RD,
	    &mgr->ipfm_q_count);
	skoid_add_handler(&mgr->ipfm_skoid, "drain", CTLFLAG_RW,
	    ipfq_drain_sysctl, mgr, 0);

	return mgr;
}

/* Free a flowswitch IP fragment manager. */
void
fsw_ip_frag_mgr_destroy(struct fsw_ip_frag_mgr *mgr)
{
	thread_call_t __single tcall;

	lck_mtx_lock(&mgr->ipfm_lock);
	if ((tcall = mgr->ipfm_timeout_tcall) != NULL) {
		lck_mtx_unlock(&mgr->ipfm_lock);
		(void) thread_call_cancel_wait(tcall);
		(void) thread_call_free(tcall);
		mgr->ipfm_timeout_tcall = NULL;
		lck_mtx_lock(&mgr->ipfm_lock);
	}

	ipfq_drain(mgr);
	lck_mtx_unlock(&mgr->ipfm_lock);
	lck_mtx_destroy(&mgr->ipfm_lock, &fsw_ipfm_lock_group);

	skoid_destroy(&mgr->ipfm_skoid);
	sk_free_type(struct fsw_ip_frag_mgr, mgr);
}

/*
 * Reassemble a received IPv4 fragment.
 *
 * @param mgr
 *   fragment manager
 * @param pkt
 *   received packet (must have ipv4 header validated)
 * @param ip4
 *   pointer to the packet's IPv4 header
 * @param nfrags
 *   number of fragments reassembled
 * @param tlen
 *   on successful reassembly, incremented by the fragment part length of
 *   each chained fragment after the first
 * @return
 *   Successfully processed (not fully reassembled)
 *     ret = 0, *pkt = NULL (ipfm owns it), *nfrags = 0
 *   Successfully reassembled
 *     ret = 0, *pkt = 1st fragment (fragments chained in order by
 *     pkt_nextpkt), *nfrags = number of all fragments (>0)
 *   Error
 *     ret != 0 && *pkt unmodified (caller to decide what to do with *pkt),
 *     *nfrags = 0
 */
int
fsw_ip_frag_reass_v4(struct fsw_ip_frag_mgr *mgr, struct __kern_packet **pkt,
    struct ip *ip4, uint16_t *nfrags, uint16_t *tlen)
{
	struct ipf_key key;
	uint16_t unfragpartlen, offflag, fragoff, fragpartlen, fragflag;
	int err;
	uint8_t *src;

	STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_V4);

	src = (uint8_t *)(struct ip *__bidi_indexable)ip4 +
	    offsetof(struct ip, ip_src);
	bcopy(src, (void *)key.ipfk_addr, IPFK_LEN_V4);
	key.ipfk_len = IPFK_LEN_V4;
	key.ipfk_ident = (uint32_t)ip4->ip_id;

	unfragpartlen = (uint16_t)(ip4->ip_hl << 2);
	offflag = ntohs(ip4->ip_off);
	fragoff = (uint16_t)(offflag << 3);
	fragpartlen = ntohs(ip4->ip_len) - (uint16_t)(ip4->ip_hl << 2);
	fragflag = offflag & IP_MF;

	err = ipf_process(mgr, pkt, &key, unfragpartlen, fragoff,
	    fragpartlen, fragflag, nfrags, tlen);
	/* If packet has been reassembled, compute the user data length. */
	if (*pkt != NULL) {
		struct __kern_packet *p = *pkt;
		struct ip *__single iph = __unsafe_forge_single(struct ip *,
		    (struct ip *)p->pkt_flow_ip_hdr);
		p->pkt_flow_ulen = ntohs(iph->ip_len) - p->pkt_flow_ip_hlen -
		    p->pkt_flow->flow_l4._l4_hlen;
	}
	return err;
}
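
/*
 * Illustrative caller sketch (not compiled; the drop/continue actions are
 * hypothetical placeholders): how the three-way return contract documented
 * above is typically consumed.
 */
#if 0
	uint16_t nfrags = 0, tlen = 0;
	int err = fsw_ip_frag_reass_v4(mgr, &pkt, ip4, &nfrags, &tlen);
	if (err != 0) {
		/* ownership retained: caller decides, e.g. drop pkt */
	} else if (pkt == NULL) {
		/* fragment absorbed by ipfm; nothing more to do */
	} else {
		/* pkt leads a pkt_nextpkt chain of nfrags fragments */
	}
#endif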
/*
 * Reassemble a received IPv6 fragment.
 *
 * @param mgr
 *   fragment manager
 * @param pkt
 *   received packet (must have ipv6 header validated)
 * @param ip6
 *   pointer to the packet's IPv6 header
 * @param ip6f
 *   pointer to the packet's IPv6 Fragment Header
 * @param nfrags
 *   number of fragments reassembled
 * @param tlen
 *   on successful reassembly, incremented by the fragment part length of
 *   each chained fragment after the first
 * @return
 *   Successfully processed (not fully reassembled)
 *     ret = 0, *pkt = NULL (ipfm owns it), *nfrags = 0
 *   Successfully reassembled
 *     ret = 0, *pkt = 1st fragment (fragments chained in order by
 *     pkt_nextpkt), *nfrags = number of all fragments (>0)
 *   Error
 *     ret != 0 && *pkt unmodified (caller to decide what to do with *pkt),
 *     *nfrags = 0
 */
int
fsw_ip_frag_reass_v6(struct fsw_ip_frag_mgr *mgr, struct __kern_packet **pkt,
    struct ip6_hdr *ip6, struct ip6_frag *ip6f, uint16_t *nfrags,
    uint16_t *tlen)
{
	struct ipf_key key;
	ptrdiff_t ip6f_ptroff = (uintptr_t)ip6f - (uintptr_t)ip6;
	uint16_t ip6f_off, fragoff, fragpartlen, unfragpartlen, fragflag;
	int err;
	uint8_t *src;

	STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_V6);

	/* jumbo payload can't contain a fragment header */
	if (ip6->ip6_plen == 0) {
		*nfrags = 0;
		return ERANGE;
	}

	ASSERT(ip6f_ptroff < UINT16_MAX);
	ip6f_off = (uint16_t)ip6f_ptroff;

	fragoff = ntohs(ip6f->ip6f_offlg & IP6F_OFF_MASK);
	fragpartlen = ntohs(ip6->ip6_plen) -
	    (ip6f_off + sizeof(struct ip6_frag) - sizeof(struct ip6_hdr));
	unfragpartlen = ip6f_off;
	fragflag = ip6f->ip6f_offlg & IP6F_MORE_FRAG;

	/*
	 * RFC 6946: Handle "atomic" fragments (offset and m bit set to 0)
	 * upfront, unrelated to any reassembly.
	 *
	 * Flow classifier should process those as non-frag, ipfm shouldn't
	 * see them.
	 */
	ASSERT((ip6f->ip6f_offlg & ~IP6F_RESERVED_MASK) != 0);

	src = (uint8_t *)(struct ip6_hdr *__bidi_indexable)ip6 +
	    offsetof(struct ip6_hdr, ip6_src);
	bcopy(src, (void *)key.ipfk_addr, IPFK_LEN_V6);
	key.ipfk_len = IPFK_LEN_V6;
	key.ipfk_ident = ip6f->ip6f_ident;

	err = ipf_process(mgr, pkt, &key, unfragpartlen, fragoff,
	    fragpartlen, fragflag, nfrags, tlen);
	/* If packet has been reassembled, compute the user data length. */
	if (*pkt != NULL) {
		struct __kern_packet *p = *pkt;
		struct ip6_hdr *__single ip6h =
		    __unsafe_forge_single(struct ip6_hdr *,
		    (struct ip6_hdr *)p->pkt_flow_ip_hdr);
		p->pkt_flow_ulen = ntohs(ip6h->ip6_plen) -
		    p->pkt_flow->flow_l4._l4_hlen;
	}
	return err;
}
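
/*
 * Worked example for the length math above (illustrative, assuming a
 * fragment header that immediately follows the base IPv6 header): with
 * ip6_plen = 1456 and ip6f_off = 40, the fragmentable part is
 * 1456 - (40 + 8 - 40) = 1448 bytes, and unfragpartlen = 40 covers only
 * the base header.  Any extension headers preceding the fragment header
 * grow ip6f_off and thus the unfragmentable part.
 */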
static struct mbuf *
ipf_pkt2mbuf(struct fsw_ip_frag_mgr *mgr, struct __kern_packet *pkt)
{
	unsigned int one = 1;
	struct mbuf *__single m = NULL;
	struct mbuf *pkt_mbuf = pkt->pkt_mbuf;
	uint8_t *buf;
	struct ip6_hdr *ip6;
	uint32_t l3t_len;
	int err;

	l3t_len = pkt->pkt_length - pkt->pkt_l2_len;
	if (pkt->pkt_link_flags & PKT_LINKF_ETHFCS) {
		l3t_len -= ETHER_CRC_LEN;
	}

	err = mbuf_allocpacket(MBUF_WAITOK, l3t_len, &one, &m);
	VERIFY(err == 0);
	ASSERT(l3t_len <= mbuf_maxlen(m));

	if (pkt->pkt_pflags & PKT_F_MBUF_DATA) {
		if ((pkt_mbuf->m_len < l3t_len) &&
		    (pkt_mbuf = m_pullup(pkt->pkt_mbuf, l3t_len)) == NULL) {
			m_freem(m);
			return NULL;
		} else {
			pkt->pkt_mbuf = pkt_mbuf;
			bcopy(m_mtod_current(pkt->pkt_mbuf) + pkt->pkt_l2_len,
			    m_mtod_current(m), l3t_len);
		}
	} else {
		MD_BUFLET_ADDR_ABS(pkt, buf);
		buf += (pkt->pkt_headroom + pkt->pkt_l2_len);
		bcopy(buf, m_mtod_current(m), l3t_len);
	}
	m->m_pkthdr.len = m->m_len = l3t_len;

	ip6 = mtod(m, struct ip6_hdr *);
	/* note for casting: IN6_IS_SCOPE_ doesn't need alignment */
	if (IN6_IS_SCOPE_LINKLOCAL((struct in6_addr *)(uintptr_t)&ip6->ip6_src)) {
		if (in6_embedded_scope) {
			ip6->ip6_src.s6_addr16[1] =
			    htons(mgr->ipfm_ifp->if_index);
		}
		ip6_output_setsrcifscope(m, mgr->ipfm_ifp->if_index, NULL);
	}
	if (IN6_IS_SCOPE_EMBED((struct in6_addr *)(uintptr_t)&ip6->ip6_dst)) {
		if (in6_embedded_scope) {
			ip6->ip6_dst.s6_addr16[1] =
			    htons(mgr->ipfm_ifp->if_index);
		}
		ip6_output_setdstifscope(m, mgr->ipfm_ifp->if_index, NULL);
	}

	return m;
}
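
/*
 * Note: icmp6_error_flag() consumes mbufs, not skywalk packets, so
 * ipf_pkt2mbuf() above deep-copies just the L3 portion (pkt_length minus
 * the L2 header, and minus the trailing FCS when PKT_LINKF_ETHFCS is set)
 * into a freshly allocated mbuf before an ICMPv6 error can be generated.
 */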
/*
 * Since this function can be called while holding fsw_ip_frag_mgr.ipfm_lock,
 * we need to ensure we don't enter the driver directly because a deadlock
 * can happen if this same thread tries to get the workloop lock.
 */
static void
ipf_icmp6_error_flag(struct mbuf *m, int type, int code, int param, int flags)
{
	sk_protect_t protect = sk_async_transmit_protect();
	icmp6_error_flag(m, type, code, param, flags);
	sk_async_transmit_unprotect(protect);
}

/*
 * @internal IP fragment ICMP parameter problem error handling
 *
 * @param param
 *   offending parameter offset, only applicable to ICMPv6
 */
void
ipf_icmp_param_err(struct fsw_ip_frag_mgr *mgr, struct __kern_packet *pkt,
    int param_offset)
{
	if (pkt->pkt_flow_ip_ver != IPV6_VERSION) {
		return;
	}

	struct mbuf *m = NULL;
	m = ipf_pkt2mbuf(mgr, pkt);
	if (__probable(m != NULL)) {
		ipf_icmp6_error_flag(m, ICMP6_PARAM_PROB,
		    ICMP6_PARAMPROB_HEADER, param_offset, 0);
	}
	/* m would be freed by icmp6_error_flag() */
}

/* @internal IP fragment ICMP timeout error handling */
void
ipf_icmp_timeout_err(struct fsw_ip_frag_mgr *mgr, struct ipf *f)
{
	struct __kern_packet *pkt = f->ipf_pkt;
	ASSERT(pkt != NULL);

	/* no icmp error packet for ipv4 */
	if (pkt->pkt_flow_ip_ver != IPV6_VERSION) {
		return;
	}
	/* only for the first fragment */
	if (f->ipf_off != 0) {
		return;
	}

	struct mbuf *m = NULL;
	m = ipf_pkt2mbuf(mgr, pkt);
	if (__probable(m != NULL)) {
		ipf_icmp6_error_flag(m, ICMP6_TIME_EXCEEDED,
		    ICMP6_TIME_EXCEED_REASSEMBLY, 0, 0);
	}
	/* m would be freed by icmp6_error_flag() */
}
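
/*
 * Per RFC 2460/8200 section 4.5, a Fragment Reassembly Time Exceeded
 * message is sent only if the zero-offset fragment has been received,
 * which is why ipf_icmp_timeout_err() above checks ipf_off before
 * generating the ICMPv6 error.
 */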
/* @internal IP fragment processing, v4/v6 agnostic */
int
ipf_process(struct fsw_ip_frag_mgr *mgr, struct __kern_packet **pkt_ptr,
    struct ipf_key *key, uint16_t unfraglen, uint16_t fragoff,
    uint16_t fragpartlen, uint16_t fragflag, uint16_t *nfrags,
    uint16_t *tlen)
{
	struct __kern_packet *pkt = *pkt_ptr;
	struct __kern_packet *pkt_reassed = NULL;
	struct ipfq *q, *mq = &mgr->ipfm_q;
	struct ipf *f, *f_new, *f_down;
	uint32_t nfrags_freed;
	int next;
	int first_frag = 0;
	int err = 0;
	int local_ipfq_unfraglen;

	*nfrags = 0;

	SK_DF(SK_VERB_IP_FRAG, "id %5d fragoff %5d fragpartlen %5d "
	    "fragflag 0x%x", key->ipfk_ident, fragoff, fragpartlen, fragflag);

	/*
	 * Make sure that all fragments except the last one have a data
	 * length that's a non-zero multiple of 8 bytes.
	 */
	if (fragflag && (fragpartlen == 0 || (fragpartlen & 0x7) != 0)) {
		SK_DF(SK_VERB_IP_FRAG, "frag not multiple of 8 bytes");
		STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_DROP_BAD_LEN);
		ipf_icmp_param_err(mgr, pkt,
		    offsetof(struct ip6_hdr, ip6_plen));
		return ERANGE;
	}

	lck_mtx_lock(&mgr->ipfm_lock);

	/* find ipfq */
	for (q = mq->ipfq_next; q != mq; q = q->ipfq_next) {
		if (ipf_key_cmp(key, &q->ipfq_key) == 0) {
			if (q->ipfq_is_dirty) {
				SK_DF(SK_VERB_IP_FRAG, "found dirty q, skip");
				err = EINVAL;
				goto done;
			}
			break;
		}
	}

	/* not found, create new ipfq */
	if (q == mq) {
		first_frag = 1;
		q = ipfq_alloc(mgr);
		if (q == NULL) {
			STATS_INC(mgr->ipfm_stats,
			    FSW_STATS_RX_FRAG_DROP_NOMEM);
			err = ENOMEM;
			goto done;
		}
		ipfq_insque(q, mq);
		net_update_uptime();
		bcopy(key, &q->ipfq_key, sizeof(struct ipf_key));
		q->ipfq_down = q->ipfq_up = (struct ipf *)q;
		q->ipfq_unfraglen = -1; /* The 1st fragment has not arrived. */
		q->ipfq_nfrag = 0;
		q->ipfq_timestamp = _net_uptime;
	}

	ASSERT(!q->ipfq_is_dirty);

	/* this queue has reached the per-queue frag limit */
	if (q->ipfq_nfrag > ipfm_max_frags_per_queue) {
		nfrags_freed = ipfq_freefq(mgr, q, NULL);
		STATS_ADD(mgr->ipfm_stats,
		    FSW_STATS_RX_FRAG_DROP_PER_QUEUE_LIMIT, nfrags_freed);
		err = ENOMEM;
		goto done;
	}

	local_ipfq_unfraglen = q->ipfq_unfraglen;

	/*
	 * If it's the 1st fragment, record the length of the
	 * unfragmentable part and the next header of the fragment header.
	 * Assume the first fragment to arrive will be correct.
	 * We do not have any duplicate checks here yet, so another packet
	 * with fragoff == 0 could come and overwrite the ipfq_unfraglen
	 * and, worse, the next header, at any time.
	 */
	if (fragoff == 0 && local_ipfq_unfraglen == -1) {
		local_ipfq_unfraglen = unfraglen;
	}

	/* Check that the reassembled packet would not exceed 65535 bytes. */
	if (local_ipfq_unfraglen + fragoff + fragpartlen > IP_MAXPACKET) {
		SK_DF(SK_VERB_IP_FRAG, "frag too big");
		STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_BAD);
		ipf_icmp_param_err(mgr, pkt, sizeof(struct ip6_hdr) +
		    offsetof(struct ip6_frag, ip6f_offlg));
		err = ERANGE;
		goto done;
	}

	/*
	 * If it's the 1st fragment, do the above check for each
	 * fragment already stored in the reassembly queue.
	 * If an error is found, still return 0, since we don't return
	 * ownership of a chain of offending packets back to the caller.
	 */
	if (fragoff == 0) {
		for (f = q->ipfq_down; f != (struct ipf *)q; f = f_down) {
			f_down = f->ipf_down;
			if (local_ipfq_unfraglen + f->ipf_off + f->ipf_len >
			    IP_MAXPACKET) {
				SK_DF(SK_VERB_IP_FRAG, "frag too big");
				STATS_INC(mgr->ipfm_stats,
				    FSW_STATS_RX_FRAG_BAD);
				ipf_deq(f);
				ipf_free_pkt(f);
				ipf_free(mgr, f);
			}
		}
	}

	f_new = ipf_alloc(mgr);
	if (f_new == NULL) {
		STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_DROP_NOMEM);
		err = ENOMEM;
		goto done;
	}

	f_new->ipf_mff = fragflag;
	f_new->ipf_off = fragoff;
	f_new->ipf_len = fragpartlen;
	f_new->ipf_pkt = pkt;

	if (first_frag) {
		f = (struct ipf *)q;
		goto insert;
	}

	/* Find a segment which begins after this one does. */
	for (f = q->ipfq_down; f != (struct ipf *)q; f = f->ipf_down) {
		if (f->ipf_off > f_new->ipf_off) {
			break;
		}
	}
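
	/*
	 * Illustrative sketch: with fragments kept sorted by offset, the
	 * new fragment only needs to be checked against its immediate
	 * neighbours (f->ipf_up is the last fragment starting at or before
	 * f_new, f is the first one starting after it):
	 *
	 *	f->ipf_up:  |====== prev ======|
	 *	f_new:                  |====== new ======|   <- prev spans into new
	 *	f:                                  |== next ==| <- new spans into next
	 */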
	/*
	 * If any of the fragments being reassembled overlap with any
	 * other fragments being reassembled for the same packet,
	 * reassembly of that packet must be abandoned and all the
	 * fragments that have been received for that packet must be
	 * discarded, and no ICMP error messages should be sent.
	 *
	 * It should be noted that fragments may be duplicated in the
	 * network.  Instead of treating these exact duplicate fragments
	 * as overlapping fragments, an implementation may choose to
	 * detect this case and drop exact duplicate fragments while
	 * keeping the other fragments belonging to the same packet.
	 *
	 * https://tools.ietf.org/html/rfc8200#appendix-B
	 *
	 * We apply this rule to both IPv4 and IPv6 here.
	 */
	if (((f->ipf_up != (struct ipf *)q) && /* prev frag spans into f_new */
	    (f->ipf_up->ipf_off + f->ipf_up->ipf_len - f_new->ipf_off > 0)) ||
	    ((f != (struct ipf *)q) && /* f_new spans into next */
	    (f_new->ipf_off + f_new->ipf_len - f->ipf_off > 0))) {
		STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_BAD);
		/* Check for exact duplicate offset/length */
		if (((f->ipf_up != (struct ipf *)q) &&
		    ((f->ipf_up->ipf_off != f_new->ipf_off) ||
		    (f->ipf_up->ipf_len != f_new->ipf_len))) ||
		    ((f != (struct ipf *)q) &&
		    ((f->ipf_off != f_new->ipf_off) ||
		    (f->ipf_len != f_new->ipf_len)))) {
			SK_DF(SK_VERB_IP_FRAG, "frag overlap");
			ipf_free(mgr, f_new);
			/* give up the overlapping fragments queue */
			SK_DF(SK_VERB_IP_FRAG, "free overlapping queue");
			ipfq_freef(mgr, q, NULL);
			q->ipfq_is_dirty = true;
		} else {
			ipf_free(mgr, f_new);
			SK_DF(SK_VERB_IP_FRAG, "frag dup");
		}
		err = ERANGE;
		goto done;
	}

insert:
	q->ipfq_unfraglen = local_ipfq_unfraglen;
	/*
	 * Stick the new segment in its place; check for complete
	 * reassembly.  Move to the front of the packet queue, as we are
	 * the most recently active fragmented packet.
	 */
	ipf_enq(f_new, f->ipf_up);
	q->ipfq_nfrag++;

	next = 0;
	for (f = q->ipfq_down; f != (struct ipf *)q; f = f->ipf_down) {
		/* there is a hole */
		if (f->ipf_off != next) {
			goto done;
		}
		next += f->ipf_len;
	}
	/* we haven't got the last frag yet */
	if (f->ipf_up->ipf_mff) {
		goto done;
	}

	/*
	 * Reassembly is complete; concatenate fragments.
	 */
	f = q->ipfq_down;
	f_down = f->ipf_down;
	pkt_reassed = f->ipf_pkt;
	*nfrags = 1;
	while (f_down != (struct ipf *)q) {
		/* chain __kern_packet with pkt_nextpkt ptr */
		f->ipf_pkt->pkt_nextpkt = f_down->ipf_pkt;
		(*nfrags)++;
		(*tlen) += f_down->ipf_len;
		f_down = f->ipf_down;
		ipf_deq(f);
		ipf_free(mgr, f);
		f = f_down;
		f_down = f->ipf_down;
	}
	ipf_free(mgr, f);

	err = 0;
	STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_REASSED);
	ipfq_remque(q);
	ipfq_free(mgr, q);

done:
	/* ipfm takes ownership of, or returns, the packet if no error */
	if (err == 0) {
		/* reass'ed packet if done; NULL otherwise */
		*pkt_ptr = pkt_reassed;
	}
	ipfq_sched_timeout(mgr, FALSE);
	lck_mtx_unlock(&mgr->ipfm_lock);
	return err;
}

static int
ipf_key_cmp(struct ipf_key *a, struct ipf_key *b)
{
	int d;

	if ((d = (a->ipfk_len - b->ipfk_len)) != 0) {
		return d;
	}
	if ((d = (a->ipfk_ident - b->ipfk_ident)) != 0) {
		return d;
	}
	return memcmp(a->ipfk_addr, b->ipfk_addr, a->ipfk_len);
}

/*
 * Put an ip fragment on a reassembly chain.
 * Like insque, but pointers in middle of structure.
 */
static void
ipf_enq(struct ipf *f, struct ipf *up6)
{
	f->ipf_up = up6;
	f->ipf_down = up6->ipf_down;
	up6->ipf_down->ipf_up = f;
	up6->ipf_down = f;
}

/*
 * To ipf_enq as remque is to insque.
 */
static void
ipf_deq(struct ipf *f)
{
	f->ipf_up->ipf_down = f->ipf_down;
	f->ipf_down->ipf_up = f->ipf_up;
}

static void
ipfq_insque(struct ipfq *new, struct ipfq *old)
{
	new->ipfq_prev = old;
	new->ipfq_next = old->ipfq_next;
	old->ipfq_next->ipfq_prev = new;
	old->ipfq_next = new;
}

static void
ipfq_remque(struct ipfq *p6)
{
	p6->ipfq_prev->ipfq_next = p6->ipfq_next;
	p6->ipfq_next->ipfq_prev = p6->ipfq_prev;
}

/*
 * @internal drain reassembly queues until reaching the target q count.
 */
static void
_ipfq_reap(struct fsw_ip_frag_mgr *mgr, uint32_t target_q_count,
    void (*ipf_cb)(struct fsw_ip_frag_mgr *, struct ipf *))
{
	uint32_t n_freed = 0;

	LCK_MTX_ASSERT(&mgr->ipfm_lock, LCK_MTX_ASSERT_OWNED);

	SK_DF(SK_VERB_IP_FRAG, "draining (frag %d/%d queue %d/%d)",
	    mgr->ipfm_f_count, mgr->ipfm_f_limit,
	    mgr->ipfm_q_count, mgr->ipfm_q_limit);

	while (mgr->ipfm_q.ipfq_next != &mgr->ipfm_q &&
	    mgr->ipfm_q_count > target_q_count) {
		n_freed += ipfq_freefq(mgr, mgr->ipfm_q.ipfq_prev,
		    mgr->ipfm_q.ipfq_prev->ipfq_is_dirty ? NULL : ipf_cb);
	}
	STATS_ADD(mgr->ipfm_stats, FSW_STATS_RX_FRAG_DROP_REAPED, n_freed);
}

/*
 * @internal reap half of the reassembly queues to allow newer fragment
 * assembly.
 */
static void
ipfq_reap(struct fsw_ip_frag_mgr *mgr)
{
	_ipfq_reap(mgr, mgr->ipfm_q_count / 2, ipf_icmp_timeout_err);
}
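
/*
 * New queues are inserted at the head of ipfm_q (see the ipfq_insque()
 * call in ipf_process()), so ipfm_q.ipfq_prev is always the oldest
 * reassembly queue; the reap loop above frees from the tail to evict the
 * stalest reassemblies first.
 */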
/*
 * @internal reap all reassembly queues, for shutdown etc.
 */
static void
ipfq_drain(struct fsw_ip_frag_mgr *mgr)
{
	_ipfq_reap(mgr, 0, NULL);
}

static void
ipfq_timeout(thread_call_param_t arg0, thread_call_param_t arg1)
{
#pragma unused(arg1)
	struct fsw_ip_frag_mgr *__single mgr = arg0;
	struct ipfq *q;
	uint64_t now, elapsed;
	uint32_t n_freed = 0;

	net_update_uptime();
	now = _net_uptime;

	SK_DF(SK_VERB_IP_FRAG, "run");

	lck_mtx_lock(&mgr->ipfm_lock);
	q = mgr->ipfm_q.ipfq_next;
	if (q) {
		while (q != &mgr->ipfm_q) {
			q = q->ipfq_next;
			elapsed = now - q->ipfq_prev->ipfq_timestamp;
			if (elapsed > ipfm_frag_ttl) {
				SK_DF(SK_VERB_IP_FRAG, "timing out q id %5d",
				    q->ipfq_prev->ipfq_key.ipfk_ident);
				n_freed += ipfq_freefq(mgr, q->ipfq_prev,
				    q->ipfq_is_dirty ? NULL :
				    ipf_icmp_timeout_err);
			}
		}
	}
	STATS_ADD(mgr->ipfm_stats, FSW_STATS_RX_FRAG_DROP_TIMEOUT, n_freed);

	/* If running out of resources, drain ipfm queues (oldest one first) */
	if (mgr->ipfm_f_count >= mgr->ipfm_f_limit ||
	    mgr->ipfm_q_count >= mgr->ipfm_q_limit) {
		ipfq_reap(mgr);
	}

	/* re-arm the purge timer if there's work to do */
	if (mgr->ipfm_q_count > 0) {
		ipfq_sched_timeout(mgr, TRUE);
	}
	lck_mtx_unlock(&mgr->ipfm_lock);
}

static void
ipfq_sched_timeout(struct fsw_ip_frag_mgr *mgr, boolean_t in_tcall)
{
	uint32_t delay = MAX(1, ipfm_timeout_tcall_ival); /* seconds */
	thread_call_t __single tcall = mgr->ipfm_timeout_tcall;
	uint64_t now = mach_absolute_time();
	uint64_t ival, deadline = now;

	LCK_MTX_ASSERT(&mgr->ipfm_lock, LCK_MTX_ASSERT_OWNED);
	ASSERT(tcall != NULL);
	if (mgr->ipfm_q_count > 0 &&
	    (!thread_call_isactive(tcall) || in_tcall)) {
		nanoseconds_to_absolutetime(delay * NSEC_PER_SEC, &ival);
		clock_deadline_for_periodic_event(ival, now, &deadline);
		(void) thread_call_enter_delayed(tcall, deadline);
	}
}

static int
ipfq_drain_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2)
	struct fsw_ip_frag_mgr *__single mgr = arg1;

	SKOID_PROC_CALL_GUARD;

	lck_mtx_lock(&mgr->ipfm_lock);
	ipfq_drain(mgr);
	lck_mtx_unlock(&mgr->ipfm_lock);
	return 0;
}
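
/*
 * Usage note (illustrative): the "drain" skoid handler registered in
 * fsw_ip_frag_mgr_create() makes this drain path reachable from user
 * space; writing any value to the corresponding sysctl node (its exact
 * path depends on where the flowswitch instance's skoid tree is rooted)
 * frees every pending reassembly queue under ipfm_lock.
 */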
static struct ipfq *
ipfq_alloc(struct fsw_ip_frag_mgr *mgr)
{
	struct ipfq *q;

	if (mgr->ipfm_q_count > mgr->ipfm_q_limit) {
		ipfq_reap(mgr);
	}
	ASSERT(mgr->ipfm_q_count <= mgr->ipfm_q_limit);

	q = kalloc_type(struct ipfq, Z_WAITOK | Z_ZERO);
	if (q != NULL) {
		mgr->ipfm_q_count++;
		q->ipfq_is_dirty = false;
	}
	return q;
}

/* free q */
static void
ipfq_free(struct fsw_ip_frag_mgr *mgr, struct ipfq *q)
{
	kfree_type(struct ipfq, q);
	mgr->ipfm_q_count--;
}

/*
 * Free all fragments, keep q.
 * @return: number of frags freed
 */
static uint32_t
ipfq_freef(struct fsw_ip_frag_mgr *mgr, struct ipfq *q,
    void (*ipf_cb)(struct fsw_ip_frag_mgr *, struct ipf *))
{
	struct ipf *f, *down6;
	uint32_t nfrags = 0;

	for (f = q->ipfq_down; f != (struct ipf *)q; f = down6) {
		nfrags++;
		down6 = f->ipf_down;
		ipf_deq(f);
		if (ipf_cb != NULL) {
			(*ipf_cb)(mgr, f);
		}
		ipf_free_pkt(f);
		ipf_free(mgr, f);
	}
	return nfrags;
}

/*
 * Free both all fragments and q.
 * @return: number of frags freed
 */
static uint32_t
ipfq_freefq(struct fsw_ip_frag_mgr *mgr, struct ipfq *q,
    void (*ipf_cb)(struct fsw_ip_frag_mgr *, struct ipf *))
{
	uint32_t freed_count;

	freed_count = ipfq_freef(mgr, q, ipf_cb);
	ipfq_remque(q);
	ipfq_free(mgr, q);
	return freed_count;
}

static struct ipf *
ipf_alloc(struct fsw_ip_frag_mgr *mgr)
{
	struct ipf *f;

	if (mgr->ipfm_f_count > mgr->ipfm_f_limit) {
		STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_DROP_FRAG_LIMIT);
		return NULL;
	}

	f = kalloc_type(struct ipf, Z_WAITOK | Z_ZERO);
	if (f != NULL) {
		mgr->ipfm_f_count++;
	}
	return f;
}

static void
ipf_free_pkt(struct ipf *f)
{
	struct __kern_packet *pkt = f->ipf_pkt;
	ASSERT(pkt != NULL);
	pp_free_packet(__DECONST(struct kern_pbufpool *, pkt->pkt_qum.qum_pp),
	    SK_PTR_ADDR(pkt));
}

static void
ipf_free(struct fsw_ip_frag_mgr *mgr, struct ipf *f)
{
	kfree_type(struct ipf, f);
	mgr->ipfm_f_count--;
}