Index: Documentation/Configure.help =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/Documentation/Configure.help,v retrieving revision 1.1.1.42 retrieving revision 1.1.1.42.2.1 diff -u -r1.1.1.42 -r1.1.1.42.2.1 --- a/Documentation/Configure.help 14 Apr 2004 13:05:24 -0000 1.1.1.42 +++ b/Documentation/Configure.help 16 Apr 2004 13:16:05 -0000 1.1.1.42.2.1 @@ -5916,6 +5916,14 @@ and you should also say Y to "Kernel/User network link driver", below. If unsure, say N. +PF_KEY sockets +CONFIG_NET_KEY + PF_KEYv2 socket family, compatible to KAME ones. + They are required if you are going to use IPsec tools ported + from KAME. + + Say Y unless you know what you are doing. + TCP/IP networking CONFIG_INET These are the protocols used on the Internet and on most local @@ -6176,6 +6184,32 @@ gated-5). This routing protocol is not used widely, so say N unless you want to play with it. +IP: AH transformation +CONFIG_INET_AH + Support for IPsec AH. + + If unsure, say Y. + +IP: ESP transformation +CONFIG_INET_ESP + Support for IPsec ESP. + + If unsure, say Y. + +IP: IPComp transformation +CONFIG_INET_IPCOMP + Support for IP Payload Compression (RFC3173), typically needed + for IPsec. + + If unsure, say Y. + +IP: IPsec user configuration interface +CONFIG_XFRM_USER + Support for IPsec user configuration interface used + by native Linux tools. + + If unsure, say Y. + Unix domain sockets CONFIG_UNIX If you say Y here, you will include support for Unix domain sockets; @@ -6221,6 +6255,20 @@ It is safe to say N here for now. +IPv6: Privacy Extensions (RFC 3041) support +CONFIG_IPV6_PRIVACY + Privacy Extensions for Stateless Address Autoconfiguration in IPv6 + support. With this option, additional periodically-altered + pseudo-random global-scope unicast address(es) will be assigned to + your interface(s). + + By default, the kernel does not generate temporary addresses. 
+ To use temporary addresses, do + + echo 2 >/proc/sys/net/ipv6/conf/all/use_tempaddr + + See <file:Documentation/networking/ip-sysctl.txt> for details. + The SCTP Protocol (EXPERIMENTAL) CONFIG_IP_SCTP Stream Control Transmission Protocol Index: Documentation/networking/ip-sysctl.txt =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/Documentation/networking/ip-sysctl.txt,v retrieving revision 1.1.1.17 retrieving revision 1.1.1.17.2.1 diff -u -r1.1.1.17 -r1.1.1.17.2.1 --- a/Documentation/networking/ip-sysctl.txt 14 Apr 2004 13:05:25 -0000 1.1.1.17 +++ b/Documentation/networking/ip-sysctl.txt 16 Apr 2004 13:16:15 -0000 1.1.1.17.2.1 @@ -668,6 +668,37 @@ 0 to disable any limiting, otherwise the maximal rate in jiffies(1) Default: 100 +use_tempaddr - INTEGER + Preference for Privacy Extensions (RFC3041). + <= 0 : disable Privacy Extensions + == 1 : enable Privacy Extensions, but prefer public + addresses over temporary addresses. + > 1 : enable Privacy Extensions and prefer temporary + addresses over public addresses. + Default: 0 (for most devices) + -1 (for point-to-point devices and loopback devices) + +temp_valid_lft - INTEGER + valid lifetime (in seconds) for temporary addresses. + Default: 604800 (7 days) + +temp_prefered_lft - INTEGER + Preferred lifetime (in seconds) for temporary addresses. + Default: 86400 (1 day) + +max_desync_factor - INTEGER + Maximum value for DESYNC_FACTOR, which is a random value + that ensures that clients don't synchronize with each + other and generate new addresses at exactly the same time. + value is in seconds. + Default: 600 + +regen_max_retry - INTEGER + Number of attempts before giving up attempting to generate + valid temporary addresses. 
+ Default: 5 + + IPv6 Update by: Pekka Savola YOSHIFUJI Hideaki / USAGI Project Index: arch/alpha/defconfig =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/arch/alpha/defconfig,v retrieving revision 1.1.1.12 retrieving revision 1.1.1.12.2.1 diff -u -r1.1.1.12 -r1.1.1.12.2.1 --- a/arch/alpha/defconfig 18 Feb 2004 13:36:30 -0000 1.1.1.12 +++ b/arch/alpha/defconfig 16 Apr 2004 13:16:16 -0000 1.1.1.12.2.1 @@ -127,6 +127,7 @@ # CONFIG_NETFILTER_DEBUG is not set # CONFIG_FILTER is not set CONFIG_UNIX=y +CONFIG_NET_KEY=y CONFIG_INET=y CONFIG_IP_MULTICAST=y # CONFIG_IP_ADVANCED_ROUTER is not set Index: arch/arm/defconfig =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/arch/arm/defconfig,v retrieving revision 1.1.1.7 retrieving revision 1.1.1.7.2.1 diff -u -r1.1.1.7 -r1.1.1.7.2.1 --- a/arch/arm/defconfig 18 Feb 2004 13:36:30 -0000 1.1.1.7 +++ b/arch/arm/defconfig 16 Apr 2004 13:16:16 -0000 1.1.1.7.2.1 @@ -170,6 +170,7 @@ # CONFIG_NETFILTER is not set # CONFIG_FILTER is not set CONFIG_UNIX=y +CONFIG_NET_KEY=y CONFIG_INET=y # CONFIG_IP_MULTICAST is not set # CONFIG_IP_ADVANCED_ROUTER is not set Index: arch/cris/defconfig =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/arch/cris/defconfig,v retrieving revision 1.1.1.11 retrieving revision 1.1.1.11.2.1 diff -u -r1.1.1.11 -r1.1.1.11.2.1 --- a/arch/cris/defconfig 18 Feb 2004 13:36:30 -0000 1.1.1.11 +++ b/arch/cris/defconfig 16 Apr 2004 13:16:16 -0000 1.1.1.11.2.1 @@ -214,6 +214,7 @@ # CONFIG_NETFILTER is not set # CONFIG_FILTER is not set CONFIG_UNIX=y +CONFIG_NET_KEY=y CONFIG_INET=y # CONFIG_IP_MULTICAST is not set # CONFIG_IP_ADVANCED_ROUTER is not set Index: arch/i386/defconfig =================================================================== RCS file: 
/home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/arch/i386/defconfig,v retrieving revision 1.1.1.28 retrieving revision 1.1.1.28.2.1 diff -u -r1.1.1.28 -r1.1.1.28.2.1 --- a/arch/i386/defconfig 14 Apr 2004 13:05:25 -0000 1.1.1.28 +++ b/arch/i386/defconfig 16 Apr 2004 13:16:16 -0000 1.1.1.28.2.1 @@ -178,6 +178,7 @@ # CONFIG_NETFILTER is not set # CONFIG_FILTER is not set CONFIG_UNIX=y +CONFIG_NET_KEY=y CONFIG_INET=y CONFIG_IP_MULTICAST=y # CONFIG_IP_ADVANCED_ROUTER is not set Index: arch/ia64/defconfig =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/arch/ia64/defconfig,v retrieving revision 1.1.1.12 retrieving revision 1.1.1.12.2.1 diff -u -r1.1.1.12 -r1.1.1.12.2.1 --- a/arch/ia64/defconfig 18 Feb 2004 13:36:30 -0000 1.1.1.12 +++ b/arch/ia64/defconfig 16 Apr 2004 13:16:16 -0000 1.1.1.12.2.1 @@ -101,6 +101,7 @@ # CONFIG_NETFILTER is not set CONFIG_FILTER=y CONFIG_UNIX=y +CONFIG_NET_KEY=y CONFIG_INET=y # CONFIG_IP_MULTICAST is not set # CONFIG_IP_ADVANCED_ROUTER is not set Index: arch/m68k/defconfig =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/arch/m68k/defconfig,v retrieving revision 1.1.1.7 retrieving revision 1.1.1.7.2.1 diff -u -r1.1.1.7 -r1.1.1.7.2.1 --- a/arch/m68k/defconfig 18 Feb 2004 13:36:30 -0000 1.1.1.7 +++ b/arch/m68k/defconfig 16 Apr 2004 13:16:16 -0000 1.1.1.7.2.1 @@ -82,6 +82,7 @@ # CONFIG_NETFILTER is not set # CONFIG_FILTER is not set CONFIG_UNIX=y +CONFIG_NET_KEY=y CONFIG_INET=y # CONFIG_IP_MULTICAST is not set # CONFIG_IP_ADVANCED_ROUTER is not set Index: arch/mips/defconfig =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/arch/mips/defconfig,v retrieving revision 1.1.1.15 retrieving revision 1.1.1.15.2.1 diff -u -r1.1.1.15 -r1.1.1.15.2.1 --- a/arch/mips/defconfig 18 Feb 2004 13:36:30 
-0000 1.1.1.15 +++ b/arch/mips/defconfig 16 Apr 2004 13:16:16 -0000 1.1.1.15.2.1 @@ -201,6 +201,7 @@ # CONFIG_NETFILTER is not set # CONFIG_FILTER is not set CONFIG_UNIX=y +CONFIG_NET_KEY=y CONFIG_INET=y CONFIG_IP_MULTICAST=y # CONFIG_IP_ADVANCED_ROUTER is not set Index: arch/mips64/defconfig =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/arch/mips64/defconfig,v retrieving revision 1.1.1.19 retrieving revision 1.1.1.19.2.1 diff -u -r1.1.1.19 -r1.1.1.19.2.1 --- a/arch/mips64/defconfig 18 Feb 2004 13:36:30 -0000 1.1.1.19 +++ b/arch/mips64/defconfig 16 Apr 2004 13:16:17 -0000 1.1.1.19.2.1 @@ -199,6 +199,7 @@ # CONFIG_NETFILTER is not set # CONFIG_FILTER is not set CONFIG_UNIX=y +CONFIG_NET_KEY=y CONFIG_INET=y CONFIG_IP_MULTICAST=y # CONFIG_IP_ADVANCED_ROUTER is not set Index: arch/parisc/defconfig =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/arch/parisc/defconfig,v retrieving revision 1.1.1.8 retrieving revision 1.1.1.8.2.1 diff -u -r1.1.1.8 -r1.1.1.8.2.1 --- a/arch/parisc/defconfig 18 Feb 2004 13:36:30 -0000 1.1.1.8 +++ b/arch/parisc/defconfig 16 Apr 2004 13:16:17 -0000 1.1.1.8.2.1 @@ -116,6 +116,7 @@ # CONFIG_NETFILTER is not set CONFIG_FILTER=y CONFIG_UNIX=y +CONFIG_NET_KEY=y CONFIG_INET=y CONFIG_IP_MULTICAST=y # CONFIG_IP_ADVANCED_ROUTER is not set Index: arch/ppc/defconfig =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/arch/ppc/defconfig,v retrieving revision 1.1.1.17 retrieving revision 1.1.1.17.2.1 diff -u -r1.1.1.17 -r1.1.1.17.2.1 --- a/arch/ppc/defconfig 18 Feb 2004 13:36:30 -0000 1.1.1.17 +++ b/arch/ppc/defconfig 16 Apr 2004 13:16:17 -0000 1.1.1.17.2.1 @@ -134,6 +134,7 @@ # CONFIG_NETFILTER_DEBUG is not set # CONFIG_FILTER is not set CONFIG_UNIX=y +CONFIG_NET_KEY=y CONFIG_INET=y CONFIG_IP_MULTICAST=y 
# CONFIG_IP_ADVANCED_ROUTER is not set Index: arch/ppc64/defconfig =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/arch/ppc64/defconfig,v retrieving revision 1.1.1.8 retrieving revision 1.1.1.8.2.1 diff -u -r1.1.1.8 -r1.1.1.8.2.1 --- a/arch/ppc64/defconfig 18 Feb 2004 13:36:30 -0000 1.1.1.8 +++ b/arch/ppc64/defconfig 16 Apr 2004 13:16:17 -0000 1.1.1.8.2.1 @@ -110,6 +110,7 @@ # CONFIG_NETFILTER is not set CONFIG_FILTER=y CONFIG_UNIX=y +CONFIG_NET_KEY=y CONFIG_INET=y CONFIG_IP_MULTICAST=y # CONFIG_IP_ADVANCED_ROUTER is not set Index: arch/s390/defconfig =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/arch/s390/defconfig,v retrieving revision 1.1.1.15 retrieving revision 1.1.1.15.2.1 diff -u -r1.1.1.15 -r1.1.1.15.2.1 --- a/arch/s390/defconfig 18 Feb 2004 13:36:30 -0000 1.1.1.15 +++ b/arch/s390/defconfig 16 Apr 2004 13:16:17 -0000 1.1.1.15.2.1 @@ -142,6 +142,7 @@ # CONFIG_NETFILTER_DEBUG is not set CONFIG_FILTER=y CONFIG_UNIX=y +CONFIG_NET_KEY=y CONFIG_INET=y CONFIG_IP_MULTICAST=y # CONFIG_IP_ADVANCED_ROUTER is not set Index: arch/s390x/defconfig =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/arch/s390x/defconfig,v retrieving revision 1.1.1.13 retrieving revision 1.1.1.13.2.1 diff -u -r1.1.1.13 -r1.1.1.13.2.1 --- a/arch/s390x/defconfig 18 Feb 2004 13:36:30 -0000 1.1.1.13 +++ b/arch/s390x/defconfig 16 Apr 2004 13:16:17 -0000 1.1.1.13.2.1 @@ -142,6 +142,7 @@ # CONFIG_NETFILTER_DEBUG is not set CONFIG_FILTER=y CONFIG_UNIX=y +CONFIG_NET_KEY=y CONFIG_INET=y CONFIG_IP_MULTICAST=y # CONFIG_IP_ADVANCED_ROUTER is not set Index: arch/sh64/defconfig =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/arch/sh64/defconfig,v retrieving revision 
1.1.1.5 retrieving revision 1.1.1.5.2.1 diff -u -r1.1.1.5 -r1.1.1.5.2.1 --- a/arch/sh64/defconfig 18 Feb 2004 13:36:30 -0000 1.1.1.5 +++ b/arch/sh64/defconfig 16 Apr 2004 13:16:17 -0000 1.1.1.5.2.1 @@ -113,6 +113,7 @@ # CONFIG_NETFILTER is not set # CONFIG_FILTER is not set CONFIG_UNIX=y +CONFIG_NET_KEY=y CONFIG_INET=y # CONFIG_IP_MULTICAST is not set # CONFIG_IP_ADVANCED_ROUTER is not set Index: arch/sparc/defconfig =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/arch/sparc/defconfig,v retrieving revision 1.1.1.19 retrieving revision 1.1.1.19.2.1 diff -u -r1.1.1.19 -r1.1.1.19.2.1 --- a/arch/sparc/defconfig 14 Apr 2004 13:05:27 -0000 1.1.1.19 +++ b/arch/sparc/defconfig 16 Apr 2004 13:16:17 -0000 1.1.1.19.2.1 @@ -144,6 +144,7 @@ # CONFIG_NETFILTER is not set # CONFIG_FILTER is not set CONFIG_UNIX=y +CONFIG_NET_KEY=y CONFIG_INET=y # CONFIG_IP_MULTICAST is not set # CONFIG_IP_ADVANCED_ROUTER is not set Index: arch/sparc64/defconfig =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/arch/sparc64/defconfig,v retrieving revision 1.1.1.30 retrieving revision 1.1.1.30.2.1 diff -u -r1.1.1.30 -r1.1.1.30.2.1 --- a/arch/sparc64/defconfig 14 Apr 2004 13:05:27 -0000 1.1.1.30 +++ b/arch/sparc64/defconfig 16 Apr 2004 13:16:17 -0000 1.1.1.30.2.1 @@ -189,6 +189,7 @@ # CONFIG_NETFILTER_DEBUG is not set CONFIG_FILTER=y CONFIG_UNIX=y +CONFIG_NET_KEY=y CONFIG_INET=y CONFIG_IP_MULTICAST=y # CONFIG_IP_ADVANCED_ROUTER is not set Index: arch/x86_64/defconfig =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/arch/x86_64/defconfig,v retrieving revision 1.1.1.7 retrieving revision 1.1.1.7.2.1 diff -u -r1.1.1.7 -r1.1.1.7.2.1 --- a/arch/x86_64/defconfig 14 Apr 2004 13:05:28 -0000 1.1.1.7 +++ b/arch/x86_64/defconfig 16 Apr 2004 13:16:18 -0000 
1.1.1.7.2.1 @@ -144,6 +144,7 @@ # CONFIG_NETFILTER is not set # CONFIG_FILTER is not set CONFIG_UNIX=y +CONFIG_NET_KEY=y CONFIG_INET=y # CONFIG_IP_MULTICAST is not set # CONFIG_IP_ADVANCED_ROUTER is not set Index: crypto/Config.in =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/crypto/Config.in,v retrieving revision 1.1.1.17 retrieving revision 1.1.1.17.2.1 diff -u -r1.1.1.17 -r1.1.1.17.2.1 --- a/crypto/Config.in 14 Apr 2004 13:05:28 -0000 1.1.1.17 +++ b/crypto/Config.in 16 Apr 2004 13:16:18 -0000 1.1.1.17.2.1 @@ -11,7 +11,8 @@ "$CONFIG_INET6_AH" = "y" -o \ "$CONFIG_INET6_AH" = "m" -o \ "$CONFIG_INET6_ESP" = "y" -o \ - "$CONFIG_INET6_ESP" = "m" ]; then + "$CONFIG_INET6_ESP" = "m" -o \ + "$CONFIG_IPV6_PRIVACY" = "y" ]; then define_bool CONFIG_CRYPTO y else bool 'Cryptographic API' CONFIG_CRYPTO @@ -25,7 +26,8 @@ "$CONFIG_INET6_AH" = "y" -o \ "$CONFIG_INET6_AH" = "m" -o \ "$CONFIG_INET6_ESP" = "y" -o \ - "$CONFIG_INET6_ESP" = "m" ]; then + "$CONFIG_INET6_ESP" = "m" -o \ + "$CONFIG_IPV6_PRIVACY" = "y" ]; then define_bool CONFIG_CRYPTO_HMAC y else bool ' HMAC support' CONFIG_CRYPTO_HMAC @@ -33,38 +35,55 @@ tristate ' NULL algorithms' CONFIG_CRYPTO_NULL tristate ' MD4 digest algorithm' CONFIG_CRYPTO_MD4 if [ "$CONFIG_INET_AH" = "y" -o \ - "$CONFIG_INET_AH" = "m" -o \ "$CONFIG_INET_ESP" = "y" -o \ - "$CONFIG_INET_ESP" = "m" -o \ "$CONFIG_INET6_AH" = "y" -o \ - "$CONFIG_INET6_AH" = "m" -o \ - "$CONFIG_INET6_ESP" = "y" -o \ - "$CONFIG_INET6_ESP" = "m" ]; then - define_bool CONFIG_CRYPTO_MD5 y + "$CONFIG_INET6_ESP" = "y" ]; then + define_tristate CONFIG_CRYPTO_MD5 y else - tristate ' MD5 digest algorithm' CONFIG_CRYPTO_MD5 + if [ "$CONFIG_IPV6" = "y" -a "$CONFIG_IPV6_PRIVACY" = "y" ]; then + define_tristate CONFIG_CRYPTO_MD5 y + else + if [ "$CONFIG_INET_AH" = "m" -o \ + "$CONFIG_INET_ESP" = "m" -o \ + "$CONFIG_INET6_AH" = "m" -o \ + "$CONFIG_INET6_ESP" = "m" ]; then + define_tristate 
CONFIG_CRYPTO_MD5 m + else + if [ "$CONFIG_IPV6" = "m" -a "$CONFIG_IPV6_PRIVACY" = "y" ]; then + define_tristate CONFIG_CRYPTO_MD5 m + else + tristate ' MD5 digest algorithm' CONFIG_CRYPTO_MD5 + fi + fi + fi fi if [ "$CONFIG_INET_AH" = "y" -o \ - "$CONFIG_INET_AH" = "m" -o \ "$CONFIG_INET_ESP" = "y" -o \ - "$CONFIG_INET_ESP" = "m" -o \ "$CONFIG_INET6_AH" = "y" -o \ - "$CONFIG_INET6_AH" = "m" -o \ - "$CONFIG_INET6_ESP" = "y" -o \ - "$CONFIG_INET6_ESP" = "m" ]; then - define_bool CONFIG_CRYPTO_SHA1 y + "$CONFIG_INET6_ESP" = "y" ]; then + define_tristate CONFIG_CRYPTO_SHA1 y else - tristate ' SHA1 digest algorithm' CONFIG_CRYPTO_SHA1 + if [ "$CONFIG_INET_AH" = "m" -o \ + "$CONFIG_INET_ESP" = "m" -o \ + "$CONFIG_INET6_AH" = "m" -o \ + "$CONFIG_INET6_ESP" = "m" ]; then + define_tristate CONFIG_CRYPTO_SHA1 m + else + tristate ' SHA1 digest algorithm' CONFIG_CRYPTO_SHA1 + fi fi tristate ' SHA256 digest algorithm' CONFIG_CRYPTO_SHA256 tristate ' SHA384 and SHA512 digest algorithms' CONFIG_CRYPTO_SHA512 if [ "$CONFIG_INET_ESP" = "y" -o \ - "$CONFIG_INET_ESP" = "m" -o \ - "$CONFIG_INET6_ESP" = "y" -o \ - "$CONFIG_INET6_ESP" = "m" ]; then - define_bool CONFIG_CRYPTO_DES y + "$CONFIG_INET6_ESP" = "y" ]; then + define_tristate CONFIG_CRYPTO_DES y else - tristate ' DES and Triple DES EDE cipher algorithms' CONFIG_CRYPTO_DES + if [ "$CONFIG_INET_ESP" = "m" -o \ + "$CONFIG_INET6_ESP" = "m" ]; then + define_tristate CONFIG_CRYPTO_DES m + else + tristate ' DES and Triple DES EDE cipher algorithms' CONFIG_CRYPTO_DES + fi fi tristate ' Blowfish cipher algorithm' CONFIG_CRYPTO_BLOWFISH tristate ' Twofish cipher algorithm' CONFIG_CRYPTO_TWOFISH @@ -74,12 +93,15 @@ tristate ' CAST6 (CAST-256) cipher algorithm' CONFIG_CRYPTO_CAST6 tristate ' ARC4 cipher algorithm' CONFIG_CRYPTO_ARC4 if [ "$CONFIG_INET_IPCOMP" = "y" -o \ - "$CONFIG_INET_IPCOMP" = "m" -o \ - "$CONFIG_INET6_IPCOMP" = "y" -o \ - "$CONFIG_INET6_IPCOMP" = "m" ]; then - define_bool CONFIG_CRYPTO_DEFLATE y + 
"$CONFIG_INET6_IPCOMP" = "y" ]; then + define_tristate CONFIG_CRYPTO_DEFLATE y else - tristate ' Deflate compression algorithm' CONFIG_CRYPTO_DEFLATE + if [ "$CONFIG_INET_IPCOMP" = "m" -o \ + "$CONFIG_INET6_IPCOMP" = "m" ]; then + define_tristate CONFIG_CRYPTO_DEFLATE m + else + tristate ' Deflate compression algorithm' CONFIG_CRYPTO_DEFLATE + fi fi tristate ' Testing module' CONFIG_CRYPTO_TEST fi Index: drivers/net/3c59x.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/drivers/net/3c59x.c,v retrieving revision 1.1.1.27 retrieving revision 1.1.1.27.2.1 diff -u -r1.1.1.27 -r1.1.1.27.2.1 --- a/drivers/net/3c59x.c 28 Nov 2003 18:26:20 -0000 1.1.1.27 +++ b/drivers/net/3c59x.c 16 Apr 2004 13:16:18 -0000 1.1.1.27.2.1 @@ -2029,7 +2029,7 @@ if (skb->ip_summed != CHECKSUM_HW) vp->tx_ring[entry].status = cpu_to_le32(skb->len | TxIntrUploaded); else - vp->tx_ring[entry].status = cpu_to_le32(skb->len | TxIntrUploaded | AddTCPChksum); + vp->tx_ring[entry].status = cpu_to_le32(skb->len | TxIntrUploaded | AddTCPChksum | AddUDPChksum); if (!skb_shinfo(skb)->nr_frags) { vp->tx_ring[entry].frag[0].addr = cpu_to_le32(pci_map_single(vp->pdev, skb->data, Index: drivers/net/ppp_generic.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/drivers/net/ppp_generic.c,v retrieving revision 1.1.1.26 retrieving revision 1.1.1.26.2.1 diff -u -r1.1.1.26 -r1.1.1.26.2.1 --- a/drivers/net/ppp_generic.c 25 Aug 2003 11:44:42 -0000 1.1.1.26 +++ b/drivers/net/ppp_generic.c 16 Apr 2004 13:16:18 -0000 1.1.1.26.2.1 @@ -57,7 +57,9 @@ #define NP_IPV6 1 /* Internet Protocol V6 */ #define NP_IPX 2 /* IPX protocol */ #define NP_AT 3 /* Appletalk protocol */ -#define NUM_NP 4 /* Number of NPs. */ +#define NP_MPLS_UC 4 /* MPLS unicast */ +#define NP_MPLS_MC 5 /* MPLS multicast */ +#define NUM_NP 6 /* Number of NPs. 
*/ #define MPHDRLEN 6 /* multilink protocol header length */ #define MPHDRLEN_SSN 4 /* ditto with short sequence numbers */ @@ -281,6 +283,10 @@ return NP_IPX; case PPP_AT: return NP_AT; + case PPP_MPLS_UC: + return NP_MPLS_UC; + case PPP_MPLS_MC: + return NP_MPLS_MC; } return -EINVAL; } @@ -291,6 +297,8 @@ PPP_IPV6, PPP_IPX, PPP_AT, + PPP_MPLS_UC, + PPP_MPLS_MC, }; /* Translates an ethertype into an NP index */ @@ -306,6 +314,10 @@ case ETH_P_PPPTALK: case ETH_P_ATALK: return NP_AT; + case ETH_P_MPLS_UC: + return NP_MPLS_UC; + case ETH_P_MPLS_MC: + return NP_MPLS_MC; } return -1; } @@ -316,6 +328,8 @@ ETH_P_IPV6, ETH_P_IPX, ETH_P_PPPTALK, + ETH_P_MPLS_UC, + ETH_P_MPLS_MC, }; /* Index: include/asm-alpha/scatterlist.h =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/include/asm-alpha/scatterlist.h,v retrieving revision 1.1.1.14 retrieving revision 1.1.1.14.2.1 diff -u -r1.1.1.14 -r1.1.1.14.2.1 --- a/include/asm-alpha/scatterlist.h 12 Oct 2001 22:35:54 -0000 1.1.1.14 +++ b/include/asm-alpha/scatterlist.h 16 Apr 2004 13:16:18 -0000 1.1.1.14.2.1 @@ -2,6 +2,7 @@ #define _ALPHA_SCATTERLIST_H #include +#include struct scatterlist { /* This will disappear in 2.5.x */ Index: include/linux/if_arp.h =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/include/linux/if_arp.h,v retrieving revision 1.1.1.17 retrieving revision 1.1.1.17.2.1 diff -u -r1.1.1.17 -r1.1.1.17.2.1 --- a/include/linux/if_arp.h 25 Feb 2002 19:38:13 -0000 1.1.1.17 +++ b/include/linux/if_arp.h 16 Apr 2004 13:16:18 -0000 1.1.1.17.2.1 @@ -59,7 +59,7 @@ #define ARPHRD_RAWHDLC 518 /* Raw HDLC */ #define ARPHRD_TUNNEL 768 /* IPIP tunnel */ -#define ARPHRD_TUNNEL6 769 /* IPIP6 tunnel */ +#define ARPHRD_TUNNEL6 769 /* IP6IP6 tunnel */ #define ARPHRD_FRAD 770 /* Frame Relay Access Device */ #define ARPHRD_SKIP 771 /* SKIP vif */ #define ARPHRD_LOOPBACK 
772 /* Loopback device */ Index: include/linux/if_ether.h =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/include/linux/if_ether.h,v retrieving revision 1.1.1.19 retrieving revision 1.1.1.19.2.1 diff -u -r1.1.1.19 -r1.1.1.19.2.1 --- a/include/linux/if_ether.h 25 Aug 2003 11:44:44 -0000 1.1.1.19 +++ b/include/linux/if_ether.h 16 Apr 2004 13:16:18 -0000 1.1.1.19.2.1 @@ -61,6 +61,8 @@ #define ETH_P_IPV6 0x86DD /* IPv6 over bluebook */ #define ETH_P_PPP_DISC 0x8863 /* PPPoE discovery messages */ #define ETH_P_PPP_SES 0x8864 /* PPPoE session messages */ +#define ETH_P_MPLS_UC 0x8847 /* MPLS Unicast traffic */ +#define ETH_P_MPLS_MC 0x8848 /* MPLS Multicast traffic */ #define ETH_P_ATMMPOA 0x884c /* MultiProtocol Over ATM */ #define ETH_P_ATMFATE 0x8884 /* Frame-based ATM Transport * over Ethernet Index: include/linux/in.h =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/include/linux/in.h,v retrieving revision 1.1.1.17 retrieving revision 1.1.1.17.2.1 diff -u -r1.1.1.17 -r1.1.1.17.2.1 --- a/include/linux/in.h 28 Nov 2003 18:26:21 -0000 1.1.1.17 +++ b/include/linux/in.h 16 Apr 2004 13:16:18 -0000 1.1.1.17.2.1 @@ -18,6 +18,7 @@ #ifndef _LINUX_IN_H #define _LINUX_IN_H +#include #include #include @@ -69,6 +70,8 @@ #define IP_RECVTOS 13 #define IP_MTU 14 #define IP_FREEBIND 15 +#define IP_IPSEC_POLICY 16 +#define IP_XFRM_POLICY 17 /* BSD compatibility */ #define IP_RECVRETOPTS IP_RETOPTS Index: include/linux/in6.h =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/include/linux/in6.h,v retrieving revision 1.1.1.16 retrieving revision 1.1.1.16.2.1 diff -u -r1.1.1.16 -r1.1.1.16.2.1 --- a/include/linux/in6.h 13 Jun 2003 14:51:38 -0000 1.1.1.16 +++ b/include/linux/in6.h 16 Apr 2004 13:16:18 -0000 1.1.1.16.2.1 @@ -180,5 +180,8 @@ 
#define IPV6_FLOWLABEL_MGR 32 #define IPV6_FLOWINFO_SEND 33 +#define IPV6_IPSEC_POLICY 34 +#define IPV6_XFRM_POLICY 35 + #endif Index: include/linux/inetdevice.h =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/include/linux/inetdevice.h,v retrieving revision 1.1.1.16 retrieving revision 1.1.1.16.2.1 diff -u -r1.1.1.16 -r1.1.1.16.2.1 --- a/include/linux/inetdevice.h 14 Apr 2004 13:05:40 -0000 1.1.1.16 +++ b/include/linux/inetdevice.h 16 Apr 2004 13:16:18 -0000 1.1.1.16.2.1 @@ -21,6 +21,8 @@ int arp_announce; int arp_ignore; int medium_id; + int no_xfrm; + int no_policy; int force_igmp_version; void *sysctl; }; Index: include/linux/ip.h =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/include/linux/ip.h,v retrieving revision 1.1.1.13 retrieving revision 1.1.1.13.2.1 diff -u -r1.1.1.13 -r1.1.1.13.2.1 --- a/include/linux/ip.h 22 Nov 2001 19:47:11 -0000 1.1.1.13 +++ b/include/linux/ip.h 16 Apr 2004 13:16:18 -0000 1.1.1.13.2.1 @@ -18,8 +18,6 @@ #define _LINUX_IP_H #include -/* SOL_IP socket options */ - #define IPTOS_TOS_MASK 0x1E #define IPTOS_TOS(tos) ((tos)&IPTOS_TOS_MASK) #define IPTOS_LOWDELAY 0x10 @@ -67,14 +65,6 @@ #define MAXTTL 255 #define IPDEFTTL 64 -/* struct timestamp, struct route and MAX_ROUTES are removed. - - REASONS: it is clear that nobody used them because: - - MAX_ROUTES value was wrong. - - "struct route" was wrong. - - "struct timestamp" had fatally misaligned bitfields and was completely unusable. - */ - #define IPOPT_OPTVAL 0 #define IPOPT_OLEN 1 #define IPOPT_OFFSET 2 @@ -135,4 +125,25 @@ /*The options start here. */ }; +struct ip_auth_hdr { + __u8 nexthdr; + __u8 hdrlen; /* This one is measured in 32 bit units! */ + __u16 reserved; + __u32 spi; + __u32 seq_no; /* Sequence number */ + __u8 auth_data[0]; /* Variable len but >=4. Mind the 64 bit alignment! 
*/ +}; + +struct ip_esp_hdr { + __u32 spi; + __u32 seq_no; /* Sequence number */ + __u8 enc_data[0]; /* Variable len but >=8. Mind the 64 bit alignment! */ +}; + +struct ip_comp_hdr { + __u8 nexthdr; + __u8 flags; + __u16 cpi; +}; + #endif /* _LINUX_IP_H */ Index: include/linux/ip6_tunnel.h =================================================================== RCS file: include/linux/ip6_tunnel.h diff -N include/linux/ip6_tunnel.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ b/include/linux/ip6_tunnel.h 16 Apr 2004 13:16:18 -0000 1.2.18.1 @@ -0,0 +1,32 @@ +/* + * $Id$ + */ + +#ifndef _IP6_TUNNEL_H +#define _IP6_TUNNEL_H + +#define IPV6_TLV_TNL_ENCAP_LIMIT 4 +#define IPV6_DEFAULT_TNL_ENCAP_LIMIT 4 + +/* don't add encapsulation limit if one isn't present in inner packet */ +#define IP6_TNL_F_IGN_ENCAP_LIMIT 0x1 +/* copy the traffic class field from the inner packet */ +#define IP6_TNL_F_USE_ORIG_TCLASS 0x2 +/* copy the flowlabel from the inner packet */ +#define IP6_TNL_F_USE_ORIG_FLOWLABEL 0x4 +/* being used for Mobile IPv6 */ +#define IP6_TNL_F_MIP6_DEV 0x8 + +struct ip6_tnl_parm { + char name[IFNAMSIZ]; /* name of tunnel device */ + int link; /* ifindex of underlying L2 interface */ + __u8 proto; /* tunnel protocol */ + __u8 encap_limit; /* encapsulation limit for tunnel */ + __u8 hop_limit; /* hop limit for tunnel */ + __u32 flowinfo; /* traffic class and flowlabel for tunnel */ + __u32 flags; /* tunnel flags */ + struct in6_addr laddr; /* local tunnel end-point address */ + struct in6_addr raddr; /* remote tunnel end-point address */ +}; + +#endif Index: include/linux/ipsec.h =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/include/linux/ipsec.h,v retrieving revision 1.1.1.14 retrieving revision 1.1.1.14.2.1 diff -u -r1.1.1.14 -r1.1.1.14.2.1 --- a/include/linux/ipsec.h 22 Nov 2001 19:47:15 -0000 1.1.1.14 +++ b/include/linux/ipsec.h 16 Apr 2004 13:16:18 -0000 1.1.1.14.2.1 @@ -1,69 
+1,46 @@ -/* - * Definitions for the SECurity layer - * - * Author: - * Robert Muchsel - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - #ifndef _LINUX_IPSEC_H #define _LINUX_IPSEC_H -#include -#include -#include -#include - -/* Values for the set/getsockopt calls */ - -/* These defines are compatible with NRL IPv6, however their semantics - is different */ - -#define IPSEC_LEVEL_NONE -1 /* send plaintext, accept any */ -#define IPSEC_LEVEL_DEFAULT 0 /* encrypt/authenticate if possible */ - /* the default MUST be 0, because a */ - /* socket is initialized with 0's */ -#define IPSEC_LEVEL_USE 1 /* use outbound, don't require inbound */ -#define IPSEC_LEVEL_REQUIRE 2 /* require both directions */ -#define IPSEC_LEVEL_UNIQUE 2 /* for compatibility only */ - -#ifdef __KERNEL__ - -/* skb bit flags set on packet input processing */ - -#define RCV_SEC 0x0f /* options on receive */ -#define RCV_AUTH 0x01 /* was authenticated */ -#define RCV_CRYPT 0x02 /* was encrypted */ -#define RCV_TUNNEL 0x04 /* was tunneled */ -#define SND_SEC 0xf0 /* options on send, these are */ -#define SND_AUTH 0x10 /* currently unused */ -#define SND_CRYPT 0x20 -#define SND_TUNNEL 0x40 - -/* - * FIXME: ignores network encryption for now.. - */ - -#ifdef CONFIG_NET_SECURITY -static __inline__ int ipsec_sk_policy(struct sock *sk, struct sk_buff *skb) -{ - return ((sk->authentication < IPSEC_LEVEL_REQUIRE) || - (skb->security & RCV_AUTH)) && - ((sk->encryption < IPSEC_LEVEL_REQUIRE) || - (skb->security & RCV_CRYPT)); -} - -#else - -static __inline__ int ipsec_sk_policy(struct sock *sk, struct sk_buff *skb) -{ - return 1; -} -#endif /* CONFIG */ +/* The definitions, required to talk to KAME racoon IKE. 
*/ + +#include + +#define IPSEC_PORT_ANY 0 +#define IPSEC_ULPROTO_ANY 255 +#define IPSEC_PROTO_ANY 255 + +enum { + IPSEC_MODE_ANY = 0, /* We do not support this for SA */ + IPSEC_MODE_TRANSPORT = 1, + IPSEC_MODE_TUNNEL = 2 +}; + +enum { + IPSEC_DIR_ANY = 0, + IPSEC_DIR_INBOUND = 1, + IPSEC_DIR_OUTBOUND = 2, + IPSEC_DIR_FWD = 3, /* It is our own */ + IPSEC_DIR_MAX = 4, + IPSEC_DIR_INVALID = 5 +}; + +enum { + IPSEC_POLICY_DISCARD = 0, + IPSEC_POLICY_NONE = 1, + IPSEC_POLICY_IPSEC = 2, + IPSEC_POLICY_ENTRUST = 3, + IPSEC_POLICY_BYPASS = 4 +}; + +enum { + IPSEC_LEVEL_DEFAULT = 0, + IPSEC_LEVEL_USE = 1, + IPSEC_LEVEL_REQUIRE = 2, + IPSEC_LEVEL_UNIQUE = 3 +}; + +#define IPSEC_MANUAL_REQID_MAX 0x3fff + +#define IPSEC_REPLAYWSIZE 32 -#endif /* __KERNEL__ */ #endif /* _LINUX_IPSEC_H */ Index: include/linux/ipv6.h =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/include/linux/ipv6.h,v retrieving revision 1.1.1.15 retrieving revision 1.1.1.15.2.1 diff -u -r1.1.1.15 -r1.1.1.15.2.1 --- a/include/linux/ipv6.h 28 Nov 2003 18:26:21 -0000 1.1.1.15 +++ b/include/linux/ipv6.h 16 Apr 2004 13:16:18 -0000 1.1.1.15.2.1 @@ -73,6 +73,27 @@ #define rt0_type rt_hdr.type }; +struct ipv6_auth_hdr { + __u8 nexthdr; + __u8 hdrlen; /* This one is measured in 32 bit units! */ + __u16 reserved; + __u32 spi; + __u32 seq_no; /* Sequence number */ + __u8 auth_data[0]; /* Length variable but >=4. Mind the 64 bit alignment! */ +}; + +struct ipv6_esp_hdr { + __u32 spi; + __u32 seq_no; /* Sequence number */ + __u8 enc_data[0]; /* Length variable but >=8. Mind the 64 bit alignment! 
*/ +}; + +struct ipv6_comp_hdr { + __u8 nexthdr; + __u8 flags; + __u16 cpi; +}; + /* * IPv6 fixed header * Index: include/linux/ipv6_route.h =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/include/linux/ipv6_route.h,v retrieving revision 1.1.1.13 retrieving revision 1.1.1.13.2.1 diff -u -r1.1.1.13 -r1.1.1.13.2.1 --- a/include/linux/ipv6_route.h 28 Nov 2003 18:26:21 -0000 1.1.1.13 +++ b/include/linux/ipv6_route.h 16 Apr 2004 13:16:18 -0000 1.1.1.13.2.1 @@ -13,15 +13,6 @@ #ifndef _LINUX_IPV6_ROUTE_H #define _LINUX_IPV6_ROUTE_H -enum -{ - RTA_IPV6_UNSPEC, - RTA_IPV6_HOPLIMIT, -}; - -#define RTA_IPV6_MAX RTA_IPV6_HOPLIMIT - - #define RTF_DEFAULT 0x00010000 /* default - learned via ND */ #define RTF_ALLONLINK 0x00020000 /* fallback, no routers on link */ #define RTF_ADDRCONF 0x00040000 /* addrconf route - RA */ @@ -33,6 +24,7 @@ #define RTF_CACHE 0x01000000 /* cache entry */ #define RTF_FLOW 0x02000000 /* flow significant route */ #define RTF_POLICY 0x04000000 /* policy route */ +#define RTF_NDISC 0x08000000 /* ndisc route */ #define RTF_LOCAL 0x80000000 Index: include/linux/netdevice.h =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/include/linux/netdevice.h,v retrieving revision 1.1.1.27 retrieving revision 1.1.1.27.2.1 diff -u -r1.1.1.27 -r1.1.1.27.2.1 --- a/include/linux/netdevice.h 18 Feb 2004 13:36:32 -0000 1.1.1.27 +++ b/include/linux/netdevice.h 16 Apr 2004 13:16:18 -0000 1.1.1.27.2.1 @@ -95,6 +95,11 @@ #define MAX_HEADER (LL_MAX_HEADER + 48) #endif +/* Reserve 16byte aligned hard_header_len, but at least 16. + * Alternative is: dev->hard_header_len ? (dev->hard_header_len + 15)&~15 : 0 + */ +#define LL_RESERVED_SPACE(dev) (((dev)->hard_header_len&~15) + 16) + /* * Network device statistics. Akin to the 2.0 ether stats but * with byte counters. 
@@ -494,6 +499,7 @@ extern int dev_queue_xmit(struct sk_buff *skb); extern int register_netdevice(struct net_device *dev); extern int unregister_netdevice(struct net_device *dev); +extern void synchronize_net(void); extern int register_netdevice_notifier(struct notifier_block *nb); extern int unregister_netdevice_notifier(struct notifier_block *nb); extern int dev_new_index(void); Index: include/linux/netlink.h =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/include/linux/netlink.h,v retrieving revision 1.1.1.19 retrieving revision 1.1.1.19.2.1 diff -u -r1.1.1.19 -r1.1.1.19.2.1 --- a/include/linux/netlink.h 28 Nov 2002 23:53:15 -0000 1.1.1.19 +++ b/include/linux/netlink.h 16 Apr 2004 13:16:18 -0000 1.1.1.19.2.1 @@ -7,6 +7,7 @@ #define NETLINK_FIREWALL 3 /* Firewalling hook */ #define NETLINK_TCPDIAG 4 /* TCP socket monitoring */ #define NETLINK_NFLOG 5 /* netfilter/iptables ULOG */ +#define NETLINK_XFRM 6 /* ipsec */ #define NETLINK_ARPD 8 #define NETLINK_ROUTE6 11 /* af_inet6 route comm channel */ #define NETLINK_IP6_FW 13 @@ -86,6 +87,8 @@ #ifdef __KERNEL__ +#include + struct netlink_skb_parms { struct ucred creds; /* Skb credentials */ @@ -107,8 +110,8 @@ extern struct sock *netlink_kernel_create(int unit, void (*input)(struct sock *sk, int len)); extern void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err); extern int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 pid, int nonblock); -extern void netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 pid, - __u32 group, int allocation); +extern int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 pid, + __u32 group, int allocation); extern void netlink_set_err(struct sock *ssk, __u32 pid, __u32 group, int code); extern int netlink_register_notifier(struct notifier_block *nb); extern int netlink_unregister_notifier(struct notifier_block *nb); Index: include/linux/pfkeyv2.h 
=================================================================== RCS file: include/linux/pfkeyv2.h diff -N include/linux/pfkeyv2.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ b/include/linux/pfkeyv2.h 16 Apr 2004 13:16:19 -0000 1.4.18.1 @@ -0,0 +1,335 @@ +/* PF_KEY user interface, this is defined by rfc2367 so + * do not make arbitrary modifications or else this header + * file will not be compliant. + */ + +#ifndef _LINUX_PFKEY2_H +#define _LINUX_PFKEY2_H + +#include + +#define PF_KEY_V2 2 +#define PFKEYV2_REVISION 199806L + +struct sadb_msg { + uint8_t sadb_msg_version; + uint8_t sadb_msg_type; + uint8_t sadb_msg_errno; + uint8_t sadb_msg_satype; + uint16_t sadb_msg_len; + uint16_t sadb_msg_reserved; + uint32_t sadb_msg_seq; + uint32_t sadb_msg_pid; +} __attribute__((packed)); +/* sizeof(struct sadb_msg) == 16 */ + +struct sadb_ext { + uint16_t sadb_ext_len; + uint16_t sadb_ext_type; +} __attribute__((packed)); +/* sizeof(struct sadb_ext) == 4 */ + +struct sadb_sa { + uint16_t sadb_sa_len; + uint16_t sadb_sa_exttype; + uint32_t sadb_sa_spi; + uint8_t sadb_sa_replay; + uint8_t sadb_sa_state; + uint8_t sadb_sa_auth; + uint8_t sadb_sa_encrypt; + uint32_t sadb_sa_flags; +} __attribute__((packed)); +/* sizeof(struct sadb_sa) == 16 */ + +struct sadb_lifetime { + uint16_t sadb_lifetime_len; + uint16_t sadb_lifetime_exttype; + uint32_t sadb_lifetime_allocations; + uint64_t sadb_lifetime_bytes; + uint64_t sadb_lifetime_addtime; + uint64_t sadb_lifetime_usetime; +} __attribute__((packed)); +/* sizeof(struct sadb_lifetime) == 32 */ + +struct sadb_address { + uint16_t sadb_address_len; + uint16_t sadb_address_exttype; + uint8_t sadb_address_proto; + uint8_t sadb_address_prefixlen; + uint16_t sadb_address_reserved; +} __attribute__((packed)); +/* sizeof(struct sadb_address) == 8 */ + +struct sadb_key { + uint16_t sadb_key_len; + uint16_t sadb_key_exttype; + uint16_t sadb_key_bits; + uint16_t sadb_key_reserved; +} __attribute__((packed)); +/* sizeof(struct sadb_key) == 8 */ 
+ +struct sadb_ident { + uint16_t sadb_ident_len; + uint16_t sadb_ident_exttype; + uint16_t sadb_ident_type; + uint16_t sadb_ident_reserved; + uint64_t sadb_ident_id; +} __attribute__((packed)); +/* sizeof(struct sadb_ident) == 16 */ + +struct sadb_sens { + uint16_t sadb_sens_len; + uint16_t sadb_sens_exttype; + uint32_t sadb_sens_dpd; + uint8_t sadb_sens_sens_level; + uint8_t sadb_sens_sens_len; + uint8_t sadb_sens_integ_level; + uint8_t sadb_sens_integ_len; + uint32_t sadb_sens_reserved; +} __attribute__((packed)); +/* sizeof(struct sadb_sens) == 16 */ + +/* followed by: + uint64_t sadb_sens_bitmap[sens_len]; + uint64_t sadb_integ_bitmap[integ_len]; */ + +struct sadb_prop { + uint16_t sadb_prop_len; + uint16_t sadb_prop_exttype; + uint8_t sadb_prop_replay; + uint8_t sadb_prop_reserved[3]; +} __attribute__((packed)); +/* sizeof(struct sadb_prop) == 8 */ + +/* followed by: + struct sadb_comb sadb_combs[(sadb_prop_len + + sizeof(uint64_t) - sizeof(struct sadb_prop)) / + sizeof(struct sadb_comb)]; */ + +struct sadb_comb { + uint8_t sadb_comb_auth; + uint8_t sadb_comb_encrypt; + uint16_t sadb_comb_flags; + uint16_t sadb_comb_auth_minbits; + uint16_t sadb_comb_auth_maxbits; + uint16_t sadb_comb_encrypt_minbits; + uint16_t sadb_comb_encrypt_maxbits; + uint32_t sadb_comb_reserved; + uint32_t sadb_comb_soft_allocations; + uint32_t sadb_comb_hard_allocations; + uint64_t sadb_comb_soft_bytes; + uint64_t sadb_comb_hard_bytes; + uint64_t sadb_comb_soft_addtime; + uint64_t sadb_comb_hard_addtime; + uint64_t sadb_comb_soft_usetime; + uint64_t sadb_comb_hard_usetime; +} __attribute__((packed)); +/* sizeof(struct sadb_comb) == 72 */ + +struct sadb_supported { + uint16_t sadb_supported_len; + uint16_t sadb_supported_exttype; + uint32_t sadb_supported_reserved; +} __attribute__((packed)); +/* sizeof(struct sadb_supported) == 8 */ + +/* followed by: + struct sadb_alg sadb_algs[(sadb_supported_len + + sizeof(uint64_t) - sizeof(struct sadb_supported)) / + sizeof(struct sadb_alg)]; */ + 
+struct sadb_alg { + uint8_t sadb_alg_id; + uint8_t sadb_alg_ivlen; + uint16_t sadb_alg_minbits; + uint16_t sadb_alg_maxbits; + uint16_t sadb_alg_reserved; +} __attribute__((packed)); +/* sizeof(struct sadb_alg) == 8 */ + +struct sadb_spirange { + uint16_t sadb_spirange_len; + uint16_t sadb_spirange_exttype; + uint32_t sadb_spirange_min; + uint32_t sadb_spirange_max; + uint32_t sadb_spirange_reserved; +} __attribute__((packed)); +/* sizeof(struct sadb_spirange) == 16 */ + +struct sadb_x_kmprivate { + uint16_t sadb_x_kmprivate_len; + uint16_t sadb_x_kmprivate_exttype; + uint32_t sadb_x_kmprivate_reserved; +} __attribute__((packed)); +/* sizeof(struct sadb_x_kmprivate) == 8 */ + +struct sadb_x_sa2 { + uint16_t sadb_x_sa2_len; + uint16_t sadb_x_sa2_exttype; + uint8_t sadb_x_sa2_mode; + uint8_t sadb_x_sa2_reserved1; + uint16_t sadb_x_sa2_reserved2; + uint32_t sadb_x_sa2_sequence; + uint32_t sadb_x_sa2_reqid; +} __attribute__((packed)); +/* sizeof(struct sadb_x_sa2) == 16 */ + +struct sadb_x_policy { + uint16_t sadb_x_policy_len; + uint16_t sadb_x_policy_exttype; + uint16_t sadb_x_policy_type; + uint8_t sadb_x_policy_dir; + uint8_t sadb_x_policy_reserved; + uint32_t sadb_x_policy_id; + uint32_t sadb_x_policy_reserved2; +} __attribute__((packed)); +/* sizeof(struct sadb_x_policy) == 16 */ + +struct sadb_x_ipsecrequest { + uint16_t sadb_x_ipsecrequest_len; + uint16_t sadb_x_ipsecrequest_proto; + uint8_t sadb_x_ipsecrequest_mode; + uint8_t sadb_x_ipsecrequest_level; + uint16_t sadb_x_ipsecrequest_reserved1; + uint32_t sadb_x_ipsecrequest_reqid; + uint32_t sadb_x_ipsecrequest_reserved2; +} __attribute__((packed)); +/* sizeof(struct sadb_x_ipsecrequest) == 16 */ + +/* This defines the TYPE of Nat Traversal in use.
Currently only one + * type of NAT-T is supported, draft-ietf-ipsec-udp-encaps-06 + */ +struct sadb_x_nat_t_type { + uint16_t sadb_x_nat_t_type_len; + uint16_t sadb_x_nat_t_type_exttype; + uint8_t sadb_x_nat_t_type_type; + uint8_t sadb_x_nat_t_type_reserved[3]; +} __attribute__((packed)); +/* sizeof(struct sadb_x_nat_t_type) == 8 */ + +/* Pass a NAT Traversal port (Source or Dest port) */ +struct sadb_x_nat_t_port { + uint16_t sadb_x_nat_t_port_len; + uint16_t sadb_x_nat_t_port_exttype; + uint16_t sadb_x_nat_t_port_port; + uint16_t sadb_x_nat_t_port_reserved; +} __attribute__((packed)); +/* sizeof(struct sadb_x_nat_t_port) == 8 */ + +/* Message types */ +#define SADB_RESERVED 0 +#define SADB_GETSPI 1 +#define SADB_UPDATE 2 +#define SADB_ADD 3 +#define SADB_DELETE 4 +#define SADB_GET 5 +#define SADB_ACQUIRE 6 +#define SADB_REGISTER 7 +#define SADB_EXPIRE 8 +#define SADB_FLUSH 9 +#define SADB_DUMP 10 +#define SADB_X_PROMISC 11 +#define SADB_X_PCHANGE 12 +#define SADB_X_SPDUPDATE 13 +#define SADB_X_SPDADD 14 +#define SADB_X_SPDDELETE 15 +#define SADB_X_SPDGET 16 +#define SADB_X_SPDACQUIRE 17 +#define SADB_X_SPDDUMP 18 +#define SADB_X_SPDFLUSH 19 +#define SADB_X_SPDSETIDX 20 +#define SADB_X_SPDEXPIRE 21 +#define SADB_X_SPDDELETE2 22 +#define SADB_X_NAT_T_NEW_MAPPING 23 +#define SADB_MAX 23 + +/* Security Association flags */ +#define SADB_SAFLAGS_PFS 1 +#define SADB_SAFLAGS_NOECN 0x80000000 + +/* Security Association states */ +#define SADB_SASTATE_LARVAL 0 +#define SADB_SASTATE_MATURE 1 +#define SADB_SASTATE_DYING 2 +#define SADB_SASTATE_DEAD 3 +#define SADB_SASTATE_MAX 3 + +/* Security Association types */ +#define SADB_SATYPE_UNSPEC 0 +#define SADB_SATYPE_AH 2 +#define SADB_SATYPE_ESP 3 +#define SADB_SATYPE_RSVP 5 +#define SADB_SATYPE_OSPFV2 6 +#define SADB_SATYPE_RIPV2 7 +#define SADB_SATYPE_MIP 8 +#define SADB_X_SATYPE_IPCOMP 9 +#define SADB_SATYPE_MAX 9 + +/* Authentication algorithms */ +#define SADB_AALG_NONE 0 +#define SADB_AALG_MD5HMAC 2 +#define 
SADB_AALG_SHA1HMAC 3 +#define SADB_X_AALG_SHA2_256HMAC 5 +#define SADB_X_AALG_SHA2_384HMAC 6 +#define SADB_X_AALG_SHA2_512HMAC 7 +#define SADB_X_AALG_RIPEMD160HMAC 8 +#define SADB_X_AALG_NULL 251 /* kame */ +#define SADB_AALG_MAX 251 + +/* Encryption algorithms */ +#define SADB_EALG_NONE 0 +#define SADB_EALG_DESCBC 2 +#define SADB_EALG_3DESCBC 3 +#define SADB_X_EALG_CASTCBC 6 +#define SADB_X_EALG_BLOWFISHCBC 7 +#define SADB_EALG_NULL 11 +#define SADB_X_EALG_AESCBC 12 +#define SADB_EALG_MAX 253 /* last EALG */ +/* private allocations should use 249-255 (RFC2407) */ +#define SADB_X_EALG_SERPENTCBC 252 /* draft-ietf-ipsec-ciph-aes-cbc-00 */ +#define SADB_X_EALG_TWOFISHCBC 253 /* draft-ietf-ipsec-ciph-aes-cbc-00 */ + +/* Compression algorithms */ +#define SADB_X_CALG_NONE 0 +#define SADB_X_CALG_OUI 1 +#define SADB_X_CALG_DEFLATE 2 +#define SADB_X_CALG_LZS 3 +#define SADB_X_CALG_LZJH 4 +#define SADB_X_CALG_MAX 4 + +/* Extension Header values */ +#define SADB_EXT_RESERVED 0 +#define SADB_EXT_SA 1 +#define SADB_EXT_LIFETIME_CURRENT 2 +#define SADB_EXT_LIFETIME_HARD 3 +#define SADB_EXT_LIFETIME_SOFT 4 +#define SADB_EXT_ADDRESS_SRC 5 +#define SADB_EXT_ADDRESS_DST 6 +#define SADB_EXT_ADDRESS_PROXY 7 +#define SADB_EXT_KEY_AUTH 8 +#define SADB_EXT_KEY_ENCRYPT 9 +#define SADB_EXT_IDENTITY_SRC 10 +#define SADB_EXT_IDENTITY_DST 11 +#define SADB_EXT_SENSITIVITY 12 +#define SADB_EXT_PROPOSAL 13 +#define SADB_EXT_SUPPORTED_AUTH 14 +#define SADB_EXT_SUPPORTED_ENCRYPT 15 +#define SADB_EXT_SPIRANGE 16 +#define SADB_X_EXT_KMPRIVATE 17 +#define SADB_X_EXT_POLICY 18 +#define SADB_X_EXT_SA2 19 +/* The next four entries are for setting up NAT Traversal */ +#define SADB_X_EXT_NAT_T_TYPE 20 +#define SADB_X_EXT_NAT_T_SPORT 21 +#define SADB_X_EXT_NAT_T_DPORT 22 +#define SADB_X_EXT_NAT_T_OA 23 +#define SADB_EXT_MAX 23 + +/* Identity Extension values */ +#define SADB_IDENTTYPE_RESERVED 0 +#define SADB_IDENTTYPE_PREFIX 1 +#define SADB_IDENTTYPE_FQDN 2 +#define SADB_IDENTTYPE_USERFQDN 3 +#define 
SADB_IDENTTYPE_MAX 3 + +#endif /* !(_LINUX_PFKEY2_H) */ Index: include/linux/ppp_defs.h =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/include/linux/ppp_defs.h,v retrieving revision 1.1.1.13 retrieving revision 1.1.1.13.2.1 diff -u -r1.1.1.13 -r1.1.1.13.2.1 --- a/include/linux/ppp_defs.h 13 Mar 2000 05:12:37 -0000 1.1.1.13 +++ b/include/linux/ppp_defs.h 16 Apr 2004 13:16:19 -0000 1.1.1.13.2.1 @@ -74,12 +74,15 @@ #define PPP_IPV6 0x57 /* Internet Protocol Version 6 */ #define PPP_COMPFRAG 0xfb /* fragment compressed below bundle */ #define PPP_COMP 0xfd /* compressed packet */ +#define PPP_MPLS_UC 0x0281 /* Multi Protocol Label Switching - Unicast */ +#define PPP_MPLS_MC 0x0283 /* Multi Protocol Label Switching - Multicast */ #define PPP_IPCP 0x8021 /* IP Control Protocol */ #define PPP_ATCP 0x8029 /* AppleTalk Control Protocol */ #define PPP_IPXCP 0x802b /* IPX Control Protocol */ #define PPP_IPV6CP 0x8057 /* IPv6 Control Protocol */ #define PPP_CCPFRAG 0x80fb /* CCP at link level (below MP bundle) */ #define PPP_CCP 0x80fd /* Compression Control Protocol */ +#define PPP_MPLSCP 0x8281 /* MPLS Control Protocol (RFC 3032) */ #define PPP_LCP 0xc021 /* Link Control Protocol */ #define PPP_PAP 0xc023 /* Password Authentication Protocol */ #define PPP_LQR 0xc025 /* Link Quality Report protocol */ Index: include/linux/rtnetlink.h =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/include/linux/rtnetlink.h,v retrieving revision 1.1.1.21 retrieving revision 1.1.1.21.2.1 diff -u -r1.1.1.21 -r1.1.1.21.2.1 --- a/include/linux/rtnetlink.h 18 Feb 2004 13:36:32 -0000 1.1.1.21 +++ b/include/linux/rtnetlink.h 16 Apr 2004 13:16:19 -0000 1.1.1.21.2.1 @@ -200,10 +200,11 @@ RTA_MULTIPATH, RTA_PROTOINFO, RTA_FLOW, - RTA_CACHEINFO + RTA_CACHEINFO, + RTA_SESSION, }; -#define RTA_MAX RTA_CACHEINFO +#define RTA_MAX RTA_SESSION #define
RTM_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct rtmsg)))) #define RTM_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct rtmsg)) @@ -282,10 +283,39 @@ #define RTAX_ADVMSS RTAX_ADVMSS RTAX_REORDERING, #define RTAX_REORDERING RTAX_REORDERING -}; - -#define RTAX_MAX RTAX_REORDERING + RTAX_HOPLIMIT, +#define RTAX_HOPLIMIT RTAX_HOPLIMIT + RTAX_INITCWND, +#define RTAX_INITCWND RTAX_INITCWND + RTAX_FEATURES, +#define RTAX_FEATURES RTAX_FEATURES +}; + +#define RTAX_MAX RTAX_FEATURES + +#define RTAX_FEATURE_ECN 0x00000001 +#define RTAX_FEATURE_SACK 0x00000002 +#define RTAX_FEATURE_TIMESTAMP 0x00000004 + +struct rta_session +{ + __u8 proto; + + union { + struct { + __u16 sport; + __u16 dport; + } ports; + + struct { + __u8 type; + __u8 code; + __u16 ident; + } icmpt; + __u32 spi; + } u; +}; /********************************************************* @@ -317,6 +347,7 @@ /* ifa_flags */ #define IFA_F_SECONDARY 0x01 +#define IFA_F_TEMPORARY IFA_F_SECONDARY #define IFA_F_DEPRECATED 0x20 #define IFA_F_TENTATIVE 0x40 @@ -575,7 +606,7 @@ extern struct rtnetlink_link * rtnetlink_links[NPROTO]; extern int rtnetlink_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb); extern int rtnetlink_send(struct sk_buff *skb, u32 pid, u32 group, int echo); -extern int rtnetlink_put_metrics(struct sk_buff *skb, unsigned *metrics); +extern int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics); extern void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data); Index: include/linux/skbuff.h =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/include/linux/skbuff.h,v retrieving revision 1.1.1.26 retrieving revision 1.1.1.26.2.1 diff -u -r1.1.1.26 -r1.1.1.26.2.1 --- a/include/linux/skbuff.h 14 Apr 2004 13:05:40 -0000 1.1.1.26 +++ b/include/linux/skbuff.h 16 Apr 2004 13:16:19 -0000 1.1.1.26.2.1 @@ -148,6 +148,7 @@ struct icmphdr *icmph; struct igmphdr *igmph; struct iphdr 
*ipiph; + struct ipv6hdr *ipv6h; struct spxhdr *spxh; unsigned char *raw; } h; @@ -169,7 +170,8 @@ unsigned char *raw; } mac; - struct dst_entry *dst; + struct dst_entry *dst; + struct sec_path *sp; /* * This is the control buffer. It is free to use for every @@ -182,7 +184,7 @@ unsigned int len; /* Length of actual data */ unsigned int data_len; unsigned int csum; /* Checksum */ - unsigned char __unused, /* Dead field, may be reused */ + unsigned char local_df, cloned, /* head may be cloned (check refcnt to be sure). */ pkt_type, /* Packet class */ ip_summed; /* Driver fed us an IP checksum */ @@ -758,6 +760,24 @@ return skb->len - skb->data_len; } +static inline int skb_pagelen(const struct sk_buff *skb) +{ + int i, len = 0; + + for (i = (int)skb_shinfo(skb)->nr_frags - 1; i >= 0; i--) + len += skb_shinfo(skb)->frags[i].size; + return len + skb_headlen(skb); +} + +static inline void skb_fill_page_desc(struct sk_buff *skb, int i, struct page *page, int off, int size) +{ + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + frag->page = page; + frag->page_offset = off; + frag->size = size; + skb_shinfo(skb)->nr_frags = i+1; +} + #define SKB_PAGE_ASSERT(skb) do { if (skb_shinfo(skb)->nr_frags) out_of_line_bug(); } while (0) #define SKB_FRAG_ASSERT(skb) do { if (skb_shinfo(skb)->frag_list) out_of_line_bug(); } while (0) #define SKB_LINEAR_ASSERT(skb) do { if (skb_is_nonlinear(skb)) out_of_line_bug(); } while (0) Index: include/linux/sysctl.h =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/include/linux/sysctl.h,v retrieving revision 1.1.1.26 retrieving revision 1.1.1.26.2.1 diff -u -r1.1.1.26 -r1.1.1.26.2.1 --- a/include/linux/sysctl.h 14 Apr 2004 13:05:40 -0000 1.1.1.26 +++ b/include/linux/sysctl.h 16 Apr 2004 13:16:19 -0000 1.1.1.26.2.1 @@ -361,6 +361,8 @@ NET_IPV4_CONF_TAG=12, NET_IPV4_CONF_ARPFILTER=13, NET_IPV4_CONF_MEDIUM_ID=14, + NET_IPV4_CONF_NOXFRM=15, + 
NET_IPV4_CONF_NOPOLICY=16, NET_IPV4_CONF_FORCE_IGMP_VERSION=17, NET_IPV4_CONF_ARP_ANNOUNCE=18, NET_IPV4_CONF_ARP_IGNORE=19, @@ -417,7 +419,12 @@ NET_IPV6_DAD_TRANSMITS=7, NET_IPV6_RTR_SOLICITS=8, NET_IPV6_RTR_SOLICIT_INTERVAL=9, - NET_IPV6_RTR_SOLICIT_DELAY=10 + NET_IPV6_RTR_SOLICIT_DELAY=10, + NET_IPV6_USE_TEMPADDR=11, + NET_IPV6_TEMP_VALID_LFT=12, + NET_IPV6_TEMP_PREFERED_LFT=13, + NET_IPV6_REGEN_MAX_RETRY=14, + NET_IPV6_MAX_DESYNC_FACTOR=15 }; /* /proc/sys/net/ipv6/icmp */ Index: include/linux/timer.h =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/include/linux/timer.h,v retrieving revision 1.1.1.19 retrieving revision 1.1.1.19.2.1 diff -u -r1.1.1.19 -r1.1.1.19.2.1 --- a/include/linux/timer.h 22 Nov 2001 19:46:19 -0000 1.1.1.19 +++ b/include/linux/timer.h 16 Apr 2004 13:16:19 -0000 1.1.1.19.2.1 @@ -3,6 +3,7 @@ #include #include +#include /* * In Linux 2.4, static timers have been removed from the kernel. 
Index: include/linux/udp.h =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/include/linux/udp.h,v retrieving revision 1.1.1.14 retrieving revision 1.1.1.14.2.1 diff -u -r1.1.1.14 -r1.1.1.14.2.1 --- a/include/linux/udp.h 7 Sep 1997 21:00:24 -0000 1.1.1.14 +++ b/include/linux/udp.h 16 Apr 2004 13:16:19 -0000 1.1.1.14.2.1 @@ -17,6 +17,7 @@ #ifndef _LINUX_UDP_H #define _LINUX_UDP_H +#include struct udphdr { __u16 source; @@ -25,5 +26,11 @@ __u16 check; }; +/* UDP socket options */ +#define UDP_CORK 1 /* Never send partially complete segments */ +#define UDP_ENCAP 100 /* Set the socket to accept encapsulated packets */ + +/* UDP encapsulation types */ +#define UDP_ENCAP_ESPINUDP 2 /* draft-ietf-ipsec-udp-encaps-06 */ #endif /* _LINUX_UDP_H */ Index: include/linux/xfrm.h =================================================================== RCS file: include/linux/xfrm.h diff -N include/linux/xfrm.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ b/include/linux/xfrm.h 16 Apr 2004 13:16:19 -0000 1.6.18.1 @@ -0,0 +1,233 @@ +#ifndef _LINUX_XFRM_H +#define _LINUX_XFRM_H + +#include + +/* All of the structures in this file may not change size as they are + * passed into the kernel from userspace via netlink sockets. + */ + +/* Structure to encapsulate addresses. I do not want to use + * "standard" structure. My apologies. + */ +typedef union +{ + __u32 a4; + __u32 a6[4]; +} xfrm_address_t; + +/* Ident of a specific xfrm_state. It is used on input to lookup + * the state by (spi,daddr,ah/esp) or to store information about + * spi, protocol and tunnel address on output. + */ +struct xfrm_id +{ + xfrm_address_t daddr; + __u32 spi; + __u8 proto; +}; + +/* Selector, used as selector both on policy rules (SPD) and SAs. 
*/ + +struct xfrm_selector +{ + xfrm_address_t daddr; + xfrm_address_t saddr; + __u16 dport; + __u16 dport_mask; + __u16 sport; + __u16 sport_mask; + __u16 family; + __u8 prefixlen_d; + __u8 prefixlen_s; + __u8 proto; + int ifindex; + uid_t user; +}; + +#define XFRM_INF (~(__u64)0) + +struct xfrm_lifetime_cfg +{ + __u64 soft_byte_limit; + __u64 hard_byte_limit; + __u64 soft_packet_limit; + __u64 hard_packet_limit; + __u64 soft_add_expires_seconds; + __u64 hard_add_expires_seconds; + __u64 soft_use_expires_seconds; + __u64 hard_use_expires_seconds; +}; + +struct xfrm_lifetime_cur +{ + __u64 bytes; + __u64 packets; + __u64 add_time; + __u64 use_time; +}; + +struct xfrm_replay_state +{ + __u32 oseq; + __u32 seq; + __u32 bitmap; +}; + +struct xfrm_algo { + char alg_name[64]; + int alg_key_len; /* in bits */ + char alg_key[0]; +}; + +struct xfrm_stats { + __u32 replay_window; + __u32 replay; + __u32 integrity_failed; +}; + +enum +{ + XFRM_POLICY_IN = 0, + XFRM_POLICY_OUT = 1, + XFRM_POLICY_FWD = 2, + XFRM_POLICY_MAX = 3 +}; + +enum +{ + XFRM_SHARE_ANY, /* No limitations */ + XFRM_SHARE_SESSION, /* For this session only */ + XFRM_SHARE_USER, /* For this user only */ + XFRM_SHARE_UNIQUE /* Use once */ +}; + +/* Netlink configuration messages. 
*/ +#define XFRM_MSG_BASE 0x10 + +#define XFRM_MSG_NEWSA (XFRM_MSG_BASE + 0) +#define XFRM_MSG_DELSA (XFRM_MSG_BASE + 1) +#define XFRM_MSG_GETSA (XFRM_MSG_BASE + 2) + +#define XFRM_MSG_NEWPOLICY (XFRM_MSG_BASE + 3) +#define XFRM_MSG_DELPOLICY (XFRM_MSG_BASE + 4) +#define XFRM_MSG_GETPOLICY (XFRM_MSG_BASE + 5) + +#define XFRM_MSG_ALLOCSPI (XFRM_MSG_BASE + 6) +#define XFRM_MSG_ACQUIRE (XFRM_MSG_BASE + 7) +#define XFRM_MSG_EXPIRE (XFRM_MSG_BASE + 8) + +#define XFRM_MSG_UPDPOLICY (XFRM_MSG_BASE + 9) +#define XFRM_MSG_UPDSA (XFRM_MSG_BASE + 10) + +#define XFRM_MSG_POLEXPIRE (XFRM_MSG_BASE + 11) + +#define XFRM_MSG_MAX (XFRM_MSG_POLEXPIRE+1) + +struct xfrm_user_tmpl { + struct xfrm_id id; + __u16 family; + xfrm_address_t saddr; + __u32 reqid; + __u8 mode; + __u8 share; + __u8 optional; + __u32 aalgos; + __u32 ealgos; + __u32 calgos; +}; + +struct xfrm_encap_tmpl { + __u16 encap_type; + __u16 encap_sport; + __u16 encap_dport; + xfrm_address_t encap_oa; +}; + +/* Netlink message attributes. */ +enum xfrm_attr_type_t { + XFRMA_UNSPEC, + XFRMA_ALG_AUTH, /* struct xfrm_algo */ + XFRMA_ALG_CRYPT, /* struct xfrm_algo */ + XFRMA_ALG_COMP, /* struct xfrm_algo */ + XFRMA_ENCAP, /* struct xfrm_algo + struct xfrm_encap_tmpl */ + XFRMA_TMPL, /* 1 or more struct xfrm_user_tmpl */ + +#define XFRMA_MAX XFRMA_TMPL +}; + +struct xfrm_usersa_info { + struct xfrm_selector sel; + struct xfrm_id id; + xfrm_address_t saddr; + struct xfrm_lifetime_cfg lft; + struct xfrm_lifetime_cur curlft; + struct xfrm_stats stats; + __u32 seq; + __u32 reqid; + __u16 family; + __u8 mode; /* 0=transport,1=tunnel */ + __u8 replay_window; + __u8 flags; +#define XFRM_STATE_NOECN 1 +}; + +struct xfrm_usersa_id { + xfrm_address_t daddr; + __u32 spi; + __u16 family; + __u8 proto; +}; + +struct xfrm_userspi_info { + struct xfrm_usersa_info info; + __u32 min; + __u32 max; +}; + +struct xfrm_userpolicy_info { + struct xfrm_selector sel; + struct xfrm_lifetime_cfg lft; + struct xfrm_lifetime_cur curlft; + __u32 
priority; + __u32 index; + __u8 dir; + __u8 action; +#define XFRM_POLICY_ALLOW 0 +#define XFRM_POLICY_BLOCK 1 + __u8 flags; +#define XFRM_POLICY_LOCALOK 1 /* Allow user to override global policy */ + __u8 share; +}; + +struct xfrm_userpolicy_id { + struct xfrm_selector sel; + __u32 index; + __u8 dir; +}; + +struct xfrm_user_acquire { + struct xfrm_id id; + xfrm_address_t saddr; + struct xfrm_selector sel; + struct xfrm_userpolicy_info policy; + __u32 aalgos; + __u32 ealgos; + __u32 calgos; + __u32 seq; +}; + +struct xfrm_user_expire { + struct xfrm_usersa_info state; + __u8 hard; +}; + +struct xfrm_user_polexpire { + struct xfrm_userpolicy_info pol; + __u8 hard; +}; + +#define XFRMGRP_ACQUIRE 1 +#define XFRMGRP_EXPIRE 2 + +#endif /* _LINUX_XFRM_H */ Index: include/net/addrconf.h =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/include/net/addrconf.h,v retrieving revision 1.1.1.17 retrieving revision 1.1.1.17.2.1 diff -u -r1.1.1.17 -r1.1.1.17.2.1 --- a/include/net/addrconf.h 25 Aug 2003 11:44:44 -0000 1.1.1.17 +++ b/include/net/addrconf.h 16 Apr 2004 13:16:19 -0000 1.1.1.17.2.1 @@ -6,6 +6,13 @@ #define MAX_RTR_SOLICITATIONS 3 #define RTR_SOLICITATION_INTERVAL (4*HZ) +#define MIN_VALID_LIFETIME (2*3600) /* 2 hours */ + +#define TEMP_VALID_LIFETIME (7*86400) +#define TEMP_PREFERRED_LIFETIME (86400) +#define REGEN_MAX_RETRY (5) +#define MAX_DESYNC_FACTOR (600) + #define ADDR_CHECK_FREQUENCY (120*HZ) struct prefix_info { Index: include/net/ah.h =================================================================== RCS file: include/net/ah.h diff -N include/net/ah.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ b/include/net/ah.h 16 Apr 2004 13:16:19 -0000 1.3.2.1 @@ -0,0 +1,35 @@ +#ifndef _NET_AH_H +#define _NET_AH_H + +#include + +/* This is the maximum truncated ICV length that we know of. 
*/ +#define MAX_AH_AUTH_LEN 12 + +struct ah_data +{ + u8 *key; + int key_len; + u8 *work_icv; + int icv_full_len; + int icv_trunc_len; + + void (*icv)(struct ah_data*, + struct sk_buff *skb, u8 *icv); + + struct crypto_tfm *tfm; +}; + +static inline void +ah_hmac_digest(struct ah_data *ahp, struct sk_buff *skb, u8 *auth_data) +{ + struct crypto_tfm *tfm = ahp->tfm; + + memset(auth_data, 0, ahp->icv_trunc_len); + crypto_hmac_init(tfm, ahp->key, &ahp->key_len); + skb_icv_walk(skb, tfm, 0, skb->len, crypto_hmac_update); + crypto_hmac_final(tfm, ahp->key, &ahp->key_len, ahp->work_icv); + memcpy(auth_data, ahp->work_icv, ahp->icv_trunc_len); +} + +#endif Index: include/net/dn_fib.h =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/include/net/dn_fib.h,v retrieving revision 1.1.1.15 retrieving revision 1.1.1.15.2.1 diff -u -r1.1.1.15 -r1.1.1.15.2.1 --- a/include/net/dn_fib.h 21 Dec 2001 17:42:04 -0000 1.1.1.15 +++ b/include/net/dn_fib.h 16 Apr 2004 13:16:19 -0000 1.1.1.15.2.1 @@ -7,6 +7,9 @@ #include +/* WARNING: The ordering of these elements must match ordering + * of RTA_* rtnetlink attribute numbers. 
+ */ struct dn_kern_rta { void *rta_dst; @@ -19,8 +22,9 @@ struct rtattr *rta_mx; struct rtattr *rta_mp; unsigned char *rta_protoinfo; - unsigned char *rta_flow; + u32 *rta_flow; struct rta_cacheinfo *rta_ci; + struct rta_session *rta_sess; }; struct dn_fib_key { Index: include/net/dn_route.h =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/include/net/dn_route.h,v retrieving revision 1.1.1.14 retrieving revision 1.1.1.14.2.1 diff -u -r1.1.1.14 -r1.1.1.14.2.1 --- a/include/net/dn_route.h 11 Dec 2000 21:33:56 -0000 1.1.1.14 +++ b/include/net/dn_route.h 16 Apr 2004 13:16:19 -0000 1.1.1.14.2.1 @@ -122,7 +122,7 @@ if ((dst = sk->dst_cache) && !dst->obsolete) { try_again: skb->dst = dst_clone(dst); - dst->output(skb); + dst_output(skb); return; } Index: include/net/dst.h =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/include/net/dst.h,v retrieving revision 1.1.1.18 retrieving revision 1.1.1.18.2.1 diff -u -r1.1.1.18 -r1.1.1.18.2.1 --- a/include/net/dst.h 25 Aug 2003 11:44:44 -0000 1.1.1.18 +++ b/include/net/dst.h 16 Apr 2004 13:16:19 -0000 1.1.1.18.2.1 @@ -9,6 +9,8 @@ #define _NET_DST_H #include +#include +#include #include /* @@ -22,6 +24,13 @@ #define DST_GC_INC (HZ/2) #define DST_GC_MAX (120*HZ) +/* Each dst_entry has reference count and sits in some parent list(s). + * When it is removed from parent list, it is "freed" (dst_free). + * After this it enters dead state (dst->obsolete > 0) and if its refcnt + * is zero, it can be destroyed immediately, otherwise it is added + * to gc list and garbage collector periodically checks the refcnt. 
+ */ + struct sk_buff; struct dst_entry @@ -29,22 +38,22 @@ struct dst_entry *next; atomic_t __refcnt; /* client references */ int __use; + struct dst_entry *child; struct net_device *dev; int obsolete; int flags; #define DST_HOST 1 +#define DST_NOXFRM 2 +#define DST_NOPOLICY 4 +#define DST_NOHASH 8 unsigned long lastuse; unsigned long expires; - unsigned mxlock; - unsigned pmtu; - unsigned window; - unsigned rtt; - unsigned rttvar; - unsigned ssthresh; - unsigned cwnd; - unsigned advmss; - unsigned reordering; + unsigned short header_len; /* more space at head required */ + unsigned short trailer_len; /* space to reserve at tail */ + + u32 metrics[RTAX_MAX]; + struct dst_entry *path; unsigned long rate_last; /* rate limiting for ICMP */ unsigned long rate_tokens; @@ -53,6 +62,7 @@ struct neighbour *neighbour; struct hh_cache *hh; + struct xfrm_state *xfrm; int (*input)(struct sk_buff*); int (*output)(struct sk_buff*); @@ -75,11 +85,11 @@ int (*gc)(void); struct dst_entry * (*check)(struct dst_entry *, __u32 cookie); - struct dst_entry * (*reroute)(struct dst_entry *, - struct sk_buff *); void (*destroy)(struct dst_entry *); struct dst_entry * (*negative_advice)(struct dst_entry *); void (*link_failure)(struct sk_buff *); + void (*update_pmtu)(struct dst_entry *dst, u32 mtu); + int (*get_mss)(struct dst_entry *dst, u32 mtu); int entry_size; atomic_t entries; @@ -88,6 +98,33 @@ #ifdef __KERNEL__ +static inline u32 +dst_metric(struct dst_entry *dst, int metric) +{ + return dst->metrics[metric-1]; +} + +static inline u32 +dst_path_metric(struct dst_entry *dst, int metric) +{ + return dst->path->metrics[metric-1]; +} + +static inline u32 +dst_pmtu(struct dst_entry *dst) +{ + u32 mtu = dst_path_metric(dst, RTAX_MTU); + /* Yes, _exactly_. This is paranoia. 
*/ + barrier(); + return mtu; +} + +static inline int +dst_metric_locked(struct dst_entry *dst, int metric) +{ + return dst_metric(dst, RTAX_LOCK) & (1<<metric); +} + static inline void dst_hold(struct dst_entry * dst) { atomic_inc(&dst->__refcnt); @@ -104,22 +141,40 @@ static inline void dst_release(struct dst_entry * dst) { - if (dst) + if (dst) { + if (atomic_read(&dst->__refcnt) < 1) { + printk("BUG: dst underflow %d: %p\n", + atomic_read(&dst->__refcnt), + current_text_addr()); + } atomic_dec(&dst->__refcnt); + } +} + +/* Children define the path of the packet through the + * Linux networking. Thus, destinations are stackable. + */ + +static inline struct dst_entry *dst_pop(struct dst_entry *dst) +{ + struct dst_entry *child = dst_clone(dst->child); + + dst_release(dst); + return child; } extern void * dst_alloc(struct dst_ops * ops); extern void __dst_free(struct dst_entry * dst); -extern void dst_destroy(struct dst_entry * dst); +extern struct dst_entry *dst_destroy(struct dst_entry * dst); -static inline -void dst_free(struct dst_entry * dst) +static inline void dst_free(struct dst_entry * dst) { if (dst->obsolete > 1) return; if (!atomic_read(&dst->__refcnt)) { - dst_destroy(dst); - return; + dst = dst_destroy(dst); + if (!dst) + return; } __dst_free(dst); } @@ -155,8 +210,50 @@ dst->expires = expires; } +/* Output packet to network from transport. */ +static inline int dst_output(struct sk_buff *skb) +{ + int err; + + for (;;) { + err = skb->dst->output(skb); + + if (likely(err == 0)) + return err; + if (unlikely(err != NET_XMIT_BYPASS)) + return err; + } +} + +/* Input packet from network to transport. */ +static inline int dst_input(struct sk_buff *skb) +{ + int err; + + for (;;) { + err = skb->dst->input(skb); + + if (likely(err == 0)) + return err; + /* Oh, Jamal... Seems, I will not forgive you this mess.
:-) */ + if (unlikely(err != NET_XMIT_BYPASS)) + return err; + } +} + extern void dst_init(void); +struct flowi; +#ifndef CONFIG_XFRM +static inline int xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl, + struct sock *sk, int flags) +{ + return 0; +} +#else +extern int xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl, + struct sock *sk, int flags); +#endif #endif #endif /* _NET_DST_H */ Index: include/net/esp.h =================================================================== RCS file: include/net/esp.h diff -N include/net/esp.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ b/include/net/esp.h 16 Apr 2004 13:16:19 -0000 1.2.18.1 @@ -0,0 +1,54 @@ +#ifndef _NET_ESP_H +#define _NET_ESP_H + +#include + +struct esp_data +{ + /* Confidentiality */ + struct { + u8 *key; /* Key */ + int key_len; /* Key length */ + u8 *ivec; /* ivec buffer */ + /* ivlen is offset from enc_data, where encrypted data starts. + * It is logically different from crypto_tfm_alg_ivsize(tfm). + * We assume that it is either zero (no ivec), or + * >= crypto_tfm_alg_ivsize(tfm). */ + int ivlen; + int padlen; /* 0..255 */ + struct crypto_tfm *tfm; /* crypto handle */ + } conf; + + /* Integrity.
It is active when icv_full_len != 0 */ + struct { + u8 *key; /* Key */ + int key_len; /* Length of the key */ + u8 *work_icv; + int icv_full_len; + int icv_trunc_len; + void (*icv)(struct esp_data*, + struct sk_buff *skb, + int offset, int len, u8 *icv); + struct crypto_tfm *tfm; + } auth; +}; + +extern int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len); +extern int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer); +extern void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len); + +static inline void +esp_hmac_digest(struct esp_data *esp, struct sk_buff *skb, int offset, + int len, u8 *auth_data) +{ + struct crypto_tfm *tfm = esp->auth.tfm; + char *icv = esp->auth.work_icv; + + memset(auth_data, 0, esp->auth.icv_trunc_len); + crypto_hmac_init(tfm, esp->auth.key, &esp->auth.key_len); + skb_icv_walk(skb, tfm, offset, len, crypto_hmac_update); + crypto_hmac_final(tfm, esp->auth.key, &esp->auth.key_len, icv); + memcpy(auth_data, icv, esp->auth.icv_trunc_len); +} + +#endif Index: include/net/flow.h =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/include/net/flow.h,v retrieving revision 1.1.1.16 retrieving revision 1.1.1.16.2.1 diff -u -r1.1.1.16 -r1.1.1.16.2.1 --- a/include/net/flow.h 23 Apr 1999 02:45:19 -0000 1.1.1.16 +++ b/include/net/flow.h 16 Apr 2004 13:16:19 -0000 1.1.1.16.2.1 @@ -1,24 +1,31 @@ /* * - * Flow based forwarding rules (usage: firewalling, etc) + * Generic internet FLOW. 
* */ #ifndef _NET_FLOW_H #define _NET_FLOW_H +#include +#include + struct flowi { - int proto; /* {TCP, UDP, ICMP} */ + int oif; + int iif; union { struct { __u32 daddr; __u32 saddr; + __u32 fwmark; + __u8 tos; + __u8 scope; } ip4_u; struct { - struct in6_addr * daddr; - struct in6_addr * saddr; + struct in6_addr daddr; + struct in6_addr saddr; __u32 flowlabel; } ip6_u; } nl_u; @@ -27,9 +34,12 @@ #define fl6_flowlabel nl_u.ip6_u.flowlabel #define fl4_dst nl_u.ip4_u.daddr #define fl4_src nl_u.ip4_u.saddr +#define fl4_fwmark nl_u.ip4_u.fwmark +#define fl4_tos nl_u.ip4_u.tos +#define fl4_scope nl_u.ip4_u.scope - int oif; - + __u8 proto; + __u8 flags; union { struct { __u16 sport; @@ -41,61 +51,27 @@ __u8 code; } icmpt; - unsigned long data; + __u32 spi; } uli_u; -}; - -#define FLOWR_NODECISION 0 /* rule not appliable to flow */ -#define FLOWR_SELECT 1 /* flow must follow this rule */ -#define FLOWR_CLEAR 2 /* priority level clears flow */ -#define FLOWR_ERROR 3 - -struct fl_acc_args { - int type; - - -#define FL_ARG_FORWARD 1 -#define FL_ARG_ORIGIN 2 - - union { - struct sk_buff *skb; - struct { - struct sock *sk; - struct flowi *flow; - } fl_o; - } fl_u; -}; - - -struct pkt_filter { - atomic_t refcnt; - unsigned int offset; - __u32 value; - __u32 mask; - struct pkt_filter *next; -}; - -#define FLR_INPUT 1 -#define FLR_OUTPUT 2 - -struct flow_filter { - int type; - union { - struct pkt_filter *filter; - struct sock *sk; - } u; -}; - -struct flow_rule { - struct flow_rule_ops *ops; - unsigned char private[0]; -}; - -struct flow_rule_ops { - int (*accept)(struct rt6_info *rt, - struct rt6_info *rule, - struct fl_acc_args *args, - struct rt6_info **nrt); -}; +#define fl_ip_sport uli_u.ports.sport +#define fl_ip_dport uli_u.ports.dport +#define fl_icmp_type uli_u.icmpt.type +#define fl_icmp_code uli_u.icmpt.code +#define fl_ipsec_spi uli_u.spi + + u32 __pad; +} __attribute__((__aligned__(BITS_PER_LONG/8))); + +#define FLOW_DIR_IN 0 +#define FLOW_DIR_OUT 1 +#define 
FLOW_DIR_FWD 2 + +typedef void (*flow_resolve_t)(struct flowi *key, u16 family, u8 dir, + void **objp, atomic_t **obj_refp); + +extern void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir, + flow_resolve_t resolver); +extern void flow_cache_flush(void); +extern atomic_t flow_cache_genid; #endif Index: include/net/if_inet6.h =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/include/net/if_inet6.h,v retrieving revision 1.1.1.14 retrieving revision 1.1.1.14.2.1 diff -u -r1.1.1.14 -r1.1.1.14.2.1 --- a/include/net/if_inet6.h 28 Nov 2003 18:26:21 -0000 1.1.1.14 +++ b/include/net/if_inet6.h 16 Apr 2004 13:16:19 -0000 1.1.1.14.2.1 @@ -47,6 +47,12 @@ struct inet6_ifaddr *lst_next; /* next addr in addr_lst */ struct inet6_ifaddr *if_next; /* next addr in inet6_dev */ +#ifdef CONFIG_IPV6_PRIVACY + struct inet6_ifaddr *tmp_next; /* next addr in tempaddr_lst */ + struct inet6_ifaddr *ifpub; + int regen_count; +#endif + int dead; }; @@ -151,6 +157,15 @@ __u32 if_flags; int dead; +#ifdef CONFIG_IPV6_PRIVACY + u8 rndid[8]; + u8 entropy[8]; + struct timer_list regen_timer; + struct inet6_ifaddr *tempaddr_list; + __u8 work_eui64[8]; + __u8 work_digest[16]; +#endif + struct neigh_parms *nd_parms; struct inet6_dev *next; struct ipv6_devconf cnf; Index: include/net/inet_ecn.h =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/include/net/inet_ecn.h,v retrieving revision 1.1.1.12 retrieving revision 1.1.1.12.2.1 diff -u -r1.1.1.12 -r1.1.1.12.2.1 --- a/include/net/inet_ecn.h 30 Oct 2001 23:08:12 -0000 1.1.1.12 +++ b/include/net/inet_ecn.h 16 Apr 2004 13:16:19 -0000 1.1.1.12.2.1 @@ -1,6 +1,8 @@ #ifndef _INET_ECN_H_ #define _INET_ECN_H_ +#include + static inline int INET_ECN_is_ce(__u8 dsfield) { return (dsfield&3) == 3; @@ -44,6 +46,11 @@ iph->tos |= 1; } +static inline void IP_ECN_clear(struct iphdr *iph) +{ + 
iph->tos &= ~3; +} + struct ipv6hdr; static inline void IP6_ECN_set_ce(struct ipv6hdr *iph) @@ -51,6 +58,11 @@ *(u32*)iph |= htonl(1<<20); } +static inline void IP6_ECN_clear(struct ipv6hdr *iph) +{ + *(u32*)iph &= ~htonl(3<<20); +} + #define ip6_get_dsfield(iph) ((ntohs(*(u16*)(iph)) >> 4) & 0xFF) #endif Index: include/net/ip.h =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/include/net/ip.h,v retrieving revision 1.1.1.17 retrieving revision 1.1.1.17.2.1 diff -u -r1.1.1.17 -r1.1.1.17.2.1 --- a/include/net/ip.h 28 Nov 2003 18:26:21 -0000 1.1.1.17 +++ b/include/net/ip.h 16 Apr 2004 13:16:19 -0000 1.1.1.17.2.1 @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -46,6 +47,7 @@ #define IPSKB_MASQUERADED 1 #define IPSKB_TRANSLATED 2 #define IPSKB_FORWARDED 4 +#define IPSKB_XFRM_TUNNEL_SIZE 8 }; struct ipcm_cookie @@ -98,16 +100,19 @@ extern void ip_send_check(struct iphdr *ip); extern int ip_queue_xmit(struct sk_buff *skb, int ipfragok); extern void ip_init(void); -extern int ip_build_xmit(struct sock *sk, - int getfrag (const void *, - char *, - unsigned int, - unsigned int), - const void *frag, - unsigned length, - struct ipcm_cookie *ipc, - struct rtable *rt, - int flags); +extern int ip_append_data(struct sock *sk, + int getfrag(void *from, char *to, int offset, int len, + int odd, struct sk_buff *skb), + void *from, int len, int protolen, + struct ipcm_cookie *ipc, + struct rtable *rt, + unsigned int flags); +extern int ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb); +extern ssize_t ip_append_page(struct sock *sk, struct page *page, + int offset, size_t size, int flags); +extern int ip_push_pending_frames(struct sock *sk); +extern void ip_flush_pending_frames(struct sock *sk); + /* * Map a multicast IP onto multicast MAC for type Token Ring. 
@@ -127,8 +132,7 @@ } struct ip_reply_arg { - struct iovec iov[2]; - int n_iov; /* redundant */ + struct iovec iov[1]; u32 csum; int csumoffset; /* u16 offset of csum in iov[0].iov_base */ /* -1 if not needed */ @@ -160,14 +164,6 @@ extern int sysctl_ip_default_ttl; #ifdef CONFIG_INET -static inline int ip_send(struct sk_buff *skb) -{ - if (skb->len > skb->dst->pmtu) - return ip_fragment(skb, ip_finish_output); - else - return ip_finish_output(skb); -} - /* The function in 2.2 was invalid, producing wrong result for * check=0xFEFF. It was noticed by Arthur Skawina _year_ ago. --ANK(000625) */ static inline @@ -184,7 +180,7 @@ { return (sk->protinfo.af_inet.pmtudisc == IP_PMTUDISC_DO || (sk->protinfo.af_inet.pmtudisc == IP_PMTUDISC_WANT && - !(dst->mxlock&(1< - -struct ip6_fw_rule { - struct flow_rule flowr; - struct ip6_fw_rule *next; - struct ip6_fw_rule *prev; - struct flowi info; - unsigned long policy; -}; - -#endif - -#endif Index: include/net/ip6_route.h =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/include/net/ip6_route.h,v retrieving revision 1.1.1.17 retrieving revision 1.1.1.17.2.1 diff -u -r1.1.1.17 -r1.1.1.17.2.1 --- a/include/net/ip6_route.h 13 Jun 2003 14:51:39 -0000 1.1.1.17 +++ b/include/net/ip6_route.h 16 Apr 2004 13:16:19 -0000 1.1.1.17.2.1 @@ -39,12 +39,15 @@ extern int ipv6_route_ioctl(unsigned int cmd, void *arg); extern int ip6_route_add(struct in6_rtmsg *rtmsg, - struct nlmsghdr *); + struct nlmsghdr *, + void *rtattr); extern int ip6_del_rt(struct rt6_info *, - struct nlmsghdr *); + struct nlmsghdr *, + void *rtattr); extern int ip6_rt_addr_add(struct in6_addr *addr, - struct net_device *dev); + struct net_device *dev, + int anycast); extern int ip6_rt_addr_del(struct in6_addr *addr, struct net_device *dev); @@ -60,6 +63,12 @@ struct in6_addr *saddr, int oif, int flags); +extern struct dst_entry *ndisc_dst_alloc(struct net_device *dev, + struct neighbour 
*neigh, + int (*output)(struct sk_buff *)); +extern int ndisc_dst_gc(int *more); +extern void fib6_force_start_gc(void); + /* * support functions for ND * @@ -111,5 +120,12 @@ write_unlock(&sk->dst_lock); } +static inline int ipv6_unicast_destination(struct sk_buff *skb) +{ + struct rt6_info *rt = (struct rt6_info *) skb->dst; + + return rt->rt6i_flags & RTF_LOCAL; +} + #endif #endif Index: include/net/ip6_tunnel.h =================================================================== RCS file: include/net/ip6_tunnel.h diff -N include/net/ip6_tunnel.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ b/include/net/ip6_tunnel.h 16 Apr 2004 13:16:19 -0000 1.2.18.1 @@ -0,0 +1,44 @@ +/* + * $Id$ + */ + +#ifndef _NET_IP6_TUNNEL_H +#define _NET_IP6_TUNNEL_H + +#include +#include +#include + +/* capable of sending packets */ +#define IP6_TNL_F_CAP_XMIT 0x10000 +/* capable of receiving packets */ +#define IP6_TNL_F_CAP_RCV 0x20000 + +#define IP6_TNL_MAX 128 + +/* IPv6 tunnel */ + +struct ip6_tnl { + struct ip6_tnl *next; /* next tunnel in list */ + struct net_device *dev; /* virtual device associated with tunnel */ + struct net_device_stats stat; /* statistics for tunnel device */ + int recursion; /* depth of hard_start_xmit recursion */ + struct ip6_tnl_parm parms; /* tunnel configuration parameters */ + struct flowi fl; /* flowi template for xmit */ +}; + +/* Tunnel encapsulation limit destination sub-option */ + +struct ipv6_tlv_tnl_enc_lim { + __u8 type; /* type-code for option */ + __u8 length; /* option length */ + __u8 encap_limit; /* tunnel encapsulation limit */ +} __attribute__ ((packed)); + +#ifdef __KERNEL__ +#ifdef CONFIG_IPV6_TUNNEL +extern int __init ip6_tunnel_init(void); +extern void ip6_tunnel_cleanup(void); +#endif +#endif +#endif Index: include/net/ip_fib.h =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/include/net/ip_fib.h,v retrieving revision 1.1.1.16 retrieving revision
1.1.1.16.2.1 diff -u -r1.1.1.16 -r1.1.1.16.2.1 --- a/include/net/ip_fib.h 9 Feb 2001 19:34:13 -0000 1.1.1.16 +++ b/include/net/ip_fib.h 16 Apr 2004 13:16:19 -0000 1.1.1.16.2.1 @@ -17,7 +17,11 @@ #define _NET_IP_FIB_H #include +#include +/* WARNING: The ordering of these elements must match ordering + * of RTA_* rtnetlink attribute numbers. + */ struct kern_rta { void *rta_dst; @@ -30,8 +34,9 @@ struct rtattr *rta_mx; struct rtattr *rta_mp; unsigned char *rta_protoinfo; - unsigned char *rta_flow; + u32 *rta_flow; struct rta_cacheinfo *rta_ci; + struct rta_session *rta_sess; }; struct fib_nh @@ -65,7 +70,7 @@ int fib_protocol; u32 fib_prefsrc; u32 fib_priority; - unsigned fib_metrics[RTAX_MAX]; + u32 fib_metrics[RTAX_MAX]; #define fib_mtu fib_metrics[RTAX_MTU-1] #define fib_window fib_metrics[RTAX_WINDOW-1] #define fib_rtt fib_metrics[RTAX_RTT-1] @@ -117,7 +122,7 @@ { unsigned char tb_id; unsigned tb_stamp; - int (*tb_lookup)(struct fib_table *tb, const struct rt_key *key, struct fib_result *res); + int (*tb_lookup)(struct fib_table *tb, const struct flowi *flp, struct fib_result *res); int (*tb_insert)(struct fib_table *table, struct rtmsg *r, struct kern_rta *rta, struct nlmsghdr *n, struct netlink_skb_parms *req); @@ -130,7 +135,7 @@ int (*tb_get_info)(struct fib_table *table, char *buf, int first, int count); void (*tb_select_default)(struct fib_table *table, - const struct rt_key *key, struct fib_result *res); + const struct flowi *flp, struct fib_result *res); unsigned char tb_data[0]; }; @@ -152,18 +157,18 @@ return fib_get_table(id); } -static inline int fib_lookup(const struct rt_key *key, struct fib_result *res) +static inline int fib_lookup(const struct flowi *flp, struct fib_result *res) { - if (local_table->tb_lookup(local_table, key, res) && - main_table->tb_lookup(main_table, key, res)) + if (local_table->tb_lookup(local_table, flp, res) && + main_table->tb_lookup(main_table, flp, res)) return -ENETUNREACH; return 0; } -static inline void 
fib_select_default(const struct rt_key *key, struct fib_result *res) +static inline void fib_select_default(const struct flowi *flp, struct fib_result *res) { if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) - main_table->tb_select_default(main_table, key, res); + main_table->tb_select_default(main_table, flp, res); } #else /* CONFIG_IP_MULTIPLE_TABLES */ @@ -171,7 +176,7 @@ #define main_table (fib_tables[RT_TABLE_MAIN]) extern struct fib_table * fib_tables[RT_TABLE_MAX+1]; -extern int fib_lookup(const struct rt_key *key, struct fib_result *res); +extern int fib_lookup(const struct flowi *flp, struct fib_result *res); extern struct fib_table *__fib_new_table(int id); extern void fib_rule_put(struct fib_rule *r); @@ -191,7 +196,7 @@ return fib_tables[id] ? : __fib_new_table(id); } -extern void fib_select_default(const struct rt_key *key, struct fib_result *res); +extern void fib_select_default(const struct flowi *flp, struct fib_result *res); #endif /* CONFIG_IP_MULTIPLE_TABLES */ @@ -204,13 +209,13 @@ extern int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb); extern int fib_validate_source(u32 src, u32 dst, u8 tos, int oif, struct net_device *dev, u32 *spec_dst, u32 *itag); -extern void fib_select_multipath(const struct rt_key *key, struct fib_result *res); +extern void fib_select_multipath(const struct flowi *flp, struct fib_result *res); /* Exported by fib_semantics.c */ extern int ip_fib_check_default(u32 gw, struct net_device *dev); extern void fib_release_info(struct fib_info *); extern int fib_semantic_match(int type, struct fib_info *, - const struct rt_key *, struct fib_result*); + const struct flowi *, struct fib_result*); extern struct fib_info *fib_create_info(const struct rtmsg *r, struct kern_rta *rta, const struct nlmsghdr *, int *err); extern int fib_nh_match(struct rtmsg *r, struct nlmsghdr *, struct kern_rta *rta, struct fib_info *fi); Index: include/net/ip_vs.h 
=================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/include/net/ip_vs.h,v retrieving revision 1.1.1.7 retrieving revision 1.1.1.7.2.1 diff -u -r1.1.1.7 -r1.1.1.7.2.1 --- a/include/net/ip_vs.h 14 Apr 2004 13:05:40 -0000 1.1.1.7 +++ b/include/net/ip_vs.h 16 Apr 2004 13:16:19 -0000 1.1.1.7.2.1 @@ -279,6 +279,13 @@ #define LeaveFunction(level) do {} while (0) #endif +#define IP_VS_XMIT(skb, rt) \ +do { \ + skb->nfcache |= NFC_IPVS_PROPERTY; \ + NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \ + (rt)->u.dst.dev, dst_output); \ +} while (0) + /* * The port number of FTP service (in network order). @@ -859,7 +866,16 @@ spin_lock(&dest->dst_lock); if (!(rt = (struct rtable *) __ip_vs_dst_check(dest, rtos, 0))) { - if (ip_route_output(&rt, dest->addr, 0, rtos, 0)) { + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip4_u = { + .daddr = dest->addr, + .saddr = 0, + .tos = rtos, } }, + }; + + if (ip_route_output_key(&rt, &fl)) { spin_unlock(&dest->dst_lock); IP_VS_DBG_RL("ip_route_output error, " "dest: %u.%u.%u.%u\n", @@ -873,7 +889,16 @@ } spin_unlock(&dest->dst_lock); } else { - if (ip_route_output(&rt, cp->daddr, 0, rtos, 0)) { + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip4_u = { + .daddr = cp->daddr, + .saddr = 0, + .tos = rtos, } }, + }; + + if (ip_route_output_key(&rt, &fl)) { IP_VS_DBG_RL("ip_route_output error, dest: " "%u.%u.%u.%u\n", NIPQUAD(cp->daddr)); return NULL; Index: include/net/ipcomp.h =================================================================== RCS file: include/net/ipcomp.h diff -N include/net/ipcomp.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ b/include/net/ipcomp.h 16 Apr 2004 13:16:19 -0000 1.2.18.1 @@ -0,0 +1,12 @@ +#ifndef _NET_IPCOMP_H +#define _NET_IPCOMP_H + +#define IPCOMP_SCRATCH_SIZE 65400 + +struct ipcomp_data { + u16 threshold; + u8 *scratch; + struct crypto_tfm *tfm; +}; + +#endif Index: include/net/ipip.h 
=================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/include/net/ipip.h,v retrieving revision 1.1.1.15 retrieving revision 1.1.1.15.2.1 diff -u -r1.1.1.15 -r1.1.1.15.2.1 --- a/include/net/ipip.h 12 Apr 2001 19:11:39 -0000 1.1.1.15 +++ b/include/net/ipip.h 16 Apr 2004 13:16:19 -0000 1.1.1.15.2.1 @@ -34,7 +34,7 @@ ip_select_ident(iph, &rt->u.dst, NULL); \ ip_send_check(iph); \ \ - err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, do_ip_send); \ + err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, dst_output);\ if (err == NET_XMIT_SUCCESS || err == NET_XMIT_CN) { \ stats->tx_bytes += pkt_len; \ stats->tx_packets++; \ Index: include/net/ipv6.h =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/include/net/ipv6.h,v retrieving revision 1.1.1.20 retrieving revision 1.1.1.20.2.1 diff -u -r1.1.1.20 -r1.1.1.20.2.1 --- a/include/net/ipv6.h 14 Apr 2004 13:05:40 -0000 1.1.1.20 +++ b/include/net/ipv6.h 16 Apr 2004 13:16:19 -0000 1.1.1.20.2.1 @@ -22,6 +22,8 @@ #define SIN6_LEN_RFC2133 24 +#define IPV6_MAXPLEN 65535 + /* * NextHeader field of IPv6 header */ @@ -48,7 +50,7 @@ /* * Addr type * - * type - unicast | multicast | anycast + * type - unicast | multicast * scope - local | site | global * v4 - compat * v4mapped @@ -60,7 +62,6 @@ #define IPV6_ADDR_UNICAST 0x0001U #define IPV6_ADDR_MULTICAST 0x0002U -#define IPV6_ADDR_ANYCAST 0x0004U #define IPV6_ADDR_LOOPBACK 0x0010U #define IPV6_ADDR_LINKLOCAL 0x0020U @@ -98,6 +99,8 @@ __u32 identification; }; +#define IP6_MF 0x0001 + #ifdef __KERNEL__ #include @@ -199,12 +202,8 @@ extern int ip6_call_ra_chain(struct sk_buff *skb, int sel); -extern int ipv6_reassembly(struct sk_buff **skb, int); - extern int ipv6_parse_hopopts(struct sk_buff *skb, int); -extern int ipv6_parse_exthdrs(struct sk_buff **skb, int); - extern struct ipv6_txoptions * 
ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt); extern int ip6_frag_nqueues; @@ -239,6 +238,23 @@ memcpy((void *) a1, (const void *) a2, sizeof(struct in6_addr)); } +static inline void ipv6_addr_prefix(struct in6_addr *pfx, + const struct in6_addr *addr, + int plen) +{ + /* caller must guarantee 0 <= plen <= 128 */ + int o = plen >> 3, + b = plen & 0x7; + + memcpy(pfx->s6_addr, addr, o); + if (b != 0) { + pfx->s6_addr[o] = addr->s6_addr[o] & (0xff00 >> b); + o++; + } + if (o < 16) + memset(pfx->s6_addr + o, 0, 16 - o); +} + #ifndef __HAVE_ARCH_ADDR_SET static inline void ipv6_addr_set(struct in6_addr *addr, __u32 w1, __u32 w2, @@ -291,6 +307,26 @@ unsigned length, struct ipv6_txoptions *opt, int hlimit, int flags); +extern int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr); + +extern int ip6_append_data(struct sock *sk, + int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), + void *from, + int length, + int transhdrlen, + int hlimit, + struct ipv6_txoptions *opt, + struct flowi *fl, + struct rt6_info *rt, + unsigned int flags); + +extern int ip6_push_pending_frames(struct sock *sk); + +extern void ip6_flush_pending_frames(struct sock *sk); + +extern int ip6_dst_lookup(struct sock *sk, + struct dst_entry **dst, + struct flowi *fl); /* * skb processing functions Index: include/net/ndisc.h =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/include/net/ndisc.h,v retrieving revision 1.1.1.15 retrieving revision 1.1.1.15.2.1 diff -u -r1.1.1.15 -r1.1.1.15.2.1 --- a/include/net/ndisc.h 28 Nov 2002 23:53:15 -0000 1.1.1.15 +++ b/include/net/ndisc.h 16 Apr 2004 13:16:19 -0000 1.1.1.15.2.1 @@ -56,20 +56,6 @@ __u8 nd_opt_len; } __attribute__((__packed__)); -struct ndisc_options { - struct nd_opt_hdr *nd_opt_array[7]; - struct nd_opt_hdr *nd_opt_piend; -}; - -#define nd_opts_src_lladdr nd_opt_array[ND_OPT_SOURCE_LL_ADDR] -#define 
nd_opts_tgt_lladdr nd_opt_array[ND_OPT_TARGET_LL_ADDR] -#define nd_opts_pi nd_opt_array[ND_OPT_PREFIX_INFO] -#define nd_opts_pi_end nd_opt_piend -#define nd_opts_rh nd_opt_array[ND_OPT_REDIRECT_HDR] -#define nd_opts_mtu nd_opt_array[ND_OPT_MTU] - -extern struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur, struct nd_opt_hdr *end); -extern struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, struct ndisc_options *ndopts); extern int ndisc_init(struct net_proto_family *ops); Index: include/net/protocol.h =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/include/net/protocol.h,v retrieving revision 1.1.1.16 retrieving revision 1.1.1.16.2.1 diff -u -r1.1.1.16 -r1.1.1.16.2.1 --- a/include/net/protocol.h 22 Nov 2001 19:47:11 -0000 1.1.1.16 +++ b/include/net/protocol.h 16 Apr 2004 13:16:19 -0000 1.1.1.16.2.1 @@ -30,7 +30,7 @@ #include #endif -#define MAX_INET_PROTOS 32 /* Must be a power of 2 */ +#define MAX_INET_PROTOS 256 /* Must be a power of 2 */ /* This is used to register protocols. */ @@ -38,29 +38,23 @@ { int (*handler)(struct sk_buff *skb); void (*err_handler)(struct sk_buff *skb, u32 info); - struct inet_protocol *next; - unsigned char protocol; - unsigned char copy:1; - void *data; - const char *name; + int no_policy; }; #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) struct inet6_protocol { - int (*handler)(struct sk_buff *skb); + int (*handler)(struct sk_buff **skb, unsigned int *nhoffp); void (*err_handler)(struct sk_buff *skb, struct inet6_skb_parm *opt, int type, int code, int offset, __u32 info); - struct inet6_protocol *next; - unsigned char protocol; - unsigned char copy:1; - void *data; - const char *name; + unsigned int flags; /* INET6_PROTO_xxx */ }; +#define INET6_PROTO_NOPOLICY 0x1 +#define INET6_PROTO_FINAL 0x2 #endif /* This is used to register socket interfaces for IP protocols. 
*/ @@ -93,14 +87,14 @@ extern struct list_head inetsw6[SOCK_MAX]; #endif -extern void inet_add_protocol(struct inet_protocol *prot); -extern int inet_del_protocol(struct inet_protocol *prot); +extern int inet_add_protocol(struct inet_protocol *prot, unsigned char num); +extern int inet_del_protocol(struct inet_protocol *prot, unsigned char num); extern void inet_register_protosw(struct inet_protosw *p); extern void inet_unregister_protosw(struct inet_protosw *p); #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) -extern void inet6_add_protocol(struct inet6_protocol *prot); -extern int inet6_del_protocol(struct inet6_protocol *prot); +extern int inet6_add_protocol(struct inet6_protocol *prot, unsigned char num); +extern int inet6_del_protocol(struct inet6_protocol *prot, unsigned char num); extern void inet6_register_protosw(struct inet_protosw *p); extern void inet6_unregister_protosw(struct inet_protosw *p); #endif Index: include/net/raw.h =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/include/net/raw.h,v retrieving revision 1.1.1.14 retrieving revision 1.1.1.14.2.1 diff -u -r1.1.1.14 -r1.1.1.14.2.1 --- a/include/net/raw.h 12 Apr 2001 19:11:39 -0000 1.1.1.14 +++ b/include/net/raw.h 16 Apr 2004 13:16:19 -0000 1.1.1.14.2.1 @@ -37,6 +37,6 @@ unsigned long raddr, unsigned long laddr, int dif); -extern struct sock *raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash); +extern void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash); #endif /* _RAW_H */ Index: include/net/rawv6.h =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/include/net/rawv6.h,v retrieving revision 1.1.1.14 retrieving revision 1.1.1.14.2.1 diff -u -r1.1.1.14 -r1.1.1.14.2.1 --- a/include/net/rawv6.h 12 Apr 2001 19:11:39 -0000 1.1.1.14 +++ b/include/net/rawv6.h 16 Apr 2004 13:16:19 -0000 1.1.1.14.2.1 @@ 
-7,9 +7,7 @@ extern struct sock *raw_v6_htable[RAWV6_HTABLE_SIZE]; extern rwlock_t raw_v6_lock; -extern struct sock * ipv6_raw_deliver(struct sk_buff *skb, - int nexthdr); - +extern void ipv6_raw_deliver(struct sk_buff *skb, int nexthdr); extern struct sock *__raw_v6_lookup(struct sock *sk, unsigned short num, struct in6_addr *loc_addr, struct in6_addr *rmt_addr); Index: include/net/route.h =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/include/net/route.h,v retrieving revision 1.1.1.21 retrieving revision 1.1.1.21.2.1 diff -u -r1.1.1.21 -r1.1.1.21.2.1 --- a/include/net/route.h 25 Aug 2003 11:44:44 -0000 1.1.1.21 +++ b/include/net/route.h 16 Apr 2004 13:16:19 -0000 1.1.1.21.2.1 @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -45,19 +46,6 @@ #define RT_CONN_FLAGS(sk) (RT_TOS(sk->protinfo.af_inet.tos) | sk->localroute) -struct rt_key -{ - __u32 dst; - __u32 src; - int iif; - int oif; -#ifdef CONFIG_IP_ROUTE_FWMARK - __u32 fwmark; -#endif - __u8 tos; - __u8 scope; -}; - struct inet_peer; struct rtable { @@ -78,7 +66,7 @@ __u32 rt_gateway; /* Cache lookup keys */ - struct rt_key key; + struct flowi fl; /* Miscellaneous cached information */ __u32 rt_spec_dst; /* RFC1122 specific destination */ @@ -126,10 +114,11 @@ u32 src, u8 tos, struct net_device *dev); extern void ip_rt_advice(struct rtable **rp, int advice); extern void rt_cache_flush(int how); -extern int ip_route_output_key(struct rtable **, const struct rt_key *key); +extern int __ip_route_output_key(struct rtable **, const struct flowi *flp); +extern int ip_route_output_key(struct rtable **, struct flowi *flp); +extern int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags); extern int ip_route_input(struct sk_buff*, u32 dst, u32 src, u8 tos, struct net_device *devin); extern unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu); 
-extern void ip_rt_update_pmtu(struct dst_entry *dst, unsigned mtu); extern void ip_rt_send_redirect(struct sk_buff *skb); extern unsigned inet_addr_type(u32 addr); @@ -138,16 +127,6 @@ extern void ip_rt_get_source(u8 *src, struct rtable *rt); extern int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb); -/* Deprecated: use ip_route_output_key directly */ -static inline int ip_route_output(struct rtable **rp, - u32 daddr, u32 saddr, u32 tos, int oif) -{ - struct rt_key key = { dst:daddr, src:saddr, oif:oif, tos:tos }; - - return ip_route_output_key(rp, &key); -} - - static inline void ip_rt_put(struct rtable * rt) { if (rt) @@ -163,17 +142,47 @@ return ip_tos2prio[IPTOS_TOS(tos)>>1]; } -static inline int ip_route_connect(struct rtable **rp, u32 dst, u32 src, u32 tos, int oif) -{ +static inline int ip_route_connect(struct rtable **rp, u32 dst, + u32 src, u32 tos, int oif, u8 protocol, + u16 sport, u16 dport, struct sock *sk) +{ + struct flowi fl = { .oif = oif, + .nl_u = { .ip4_u = { .daddr = dst, + .saddr = src, + .tos = tos } }, + .proto = protocol, + .uli_u = { .ports = + { .sport = sport, + .dport = dport } } }; + int err; - err = ip_route_output(rp, dst, src, tos, oif); - if (err || (dst && src)) - return err; - dst = (*rp)->rt_dst; - src = (*rp)->rt_src; - ip_rt_put(*rp); - *rp = NULL; - return ip_route_output(rp, dst, src, tos, oif); + if (!dst || !src) { + err = __ip_route_output_key(rp, &fl); + if (err) + return err; + fl.fl4_dst = (*rp)->rt_dst; + fl.fl4_src = (*rp)->rt_src; + ip_rt_put(*rp); + *rp = NULL; + } + return ip_route_output_flow(rp, &fl, sk, 0); +} + +static inline int ip_route_newports(struct rtable **rp, u16 sport, u16 dport, + struct sock *sk) +{ + if (sport != (*rp)->fl.fl_ip_sport || + dport != (*rp)->fl.fl_ip_dport) { + struct flowi fl; + + memcpy(&fl, &(*rp)->fl, sizeof(fl)); + fl.fl_ip_sport = sport; + fl.fl_ip_dport = dport; + ip_rt_put(*rp); + *rp = NULL; + return ip_route_output_flow(rp, &fl, sk, 0); + } + return 0; } 
extern void rt_bind_peer(struct rtable *rt, int create); Index: include/net/sock.h =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/include/net/sock.h,v retrieving revision 1.1.1.27 retrieving revision 1.1.1.27.2.1 diff -u -r1.1.1.27 -r1.1.1.27.2.1 --- a/include/net/sock.h 14 Apr 2004 13:05:40 -0000 1.1.1.27 +++ b/include/net/sock.h 16 Apr 2004 13:16:19 -0000 1.1.1.27.2.1 @@ -45,6 +45,8 @@ #include /* struct ipv6_mc_socklist */ #endif +#include + #if defined(CONFIG_INET) || defined (CONFIG_INET_MODULE) #include #endif @@ -184,6 +186,12 @@ struct ipv6_txoptions *opt; struct sk_buff *pktoptions; + struct { + struct ipv6_txoptions *opt; + struct rt6_info *rt; + struct flowi fl; + int hop_limit; + } cork; }; struct raw6_opt { @@ -210,7 +218,7 @@ #if defined(CONFIG_INET) || defined (CONFIG_INET_MODULE) struct inet_opt { - int ttl; /* TTL setting */ + int uc_ttl; /* Unicast TTL */ int tos; /* TOS */ unsigned cmsg_flags; struct ip_options *opt; @@ -224,7 +232,24 @@ int mc_index; /* Multicast device index */ __u32 mc_addr; struct ip_mc_socklist *mc_list; /* Group array */ + struct page *sndmsg_page; /* Cached page for sendmsg */ + u32 sndmsg_off; /* Cached offset for sendmsg */ + /* + * Following members are used to retain the information to build + * an ip header on each ip fragmentation while the socket is corked.
+ */ + struct { + unsigned int flags; + unsigned int fragsize; + struct ip_options *opt; + struct rtable *rt; + int length; /* Total length of all frames */ + u32 addr; + } cork; }; + +#define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */ + #endif #if defined(CONFIG_PPPOE) || defined (CONFIG_PPPOE_MODULE) @@ -250,6 +275,14 @@ #define pppoe_relay proto.pppoe.relay #endif +#if defined(CONFIG_NET_KEY) || defined(CONFIG_NET_KEY_MODULE) +struct pfkey_opt { + int registered; + int promisc; +}; +#define pfkey_sk(__sk) ((__sk)->protinfo.pf_key) +#endif + /* This defines a selective acknowledgement block. */ struct tcp_sack_block { __u32 start_seq; @@ -307,6 +340,7 @@ __u16 mss_cache; /* Cached effective mss, not including SACKS */ __u16 mss_clamp; /* Maximal mss, negotiated at connection setup */ __u16 ext_header_len; /* Network protocol overhead (IP/IPv6 options) */ + __u16 ext2_header_len;/* Options depending on route */ __u8 ca_state; /* State of fast-retransmit machine */ __u8 retransmits; /* Number of unrecovered RTO timeouts. */ @@ -347,8 +381,6 @@ struct tcp_func *af_specific; /* Operations which are AF_INET{4,6} specific */ struct sk_buff *send_head; /* Front of stuff to transmit */ - struct page *sndmsg_page; /* Cached page for sendmsg */ - u32 sndmsg_off; /* Cached offset for sendmsg */ __u32 rcv_wnd; /* Current receiver window */ __u32 rcv_wup; /* rcv_nxt on last window update sent */ @@ -447,6 +479,20 @@ } westwood; }; +struct udp_opt { + int pending; /* Any pending frames ? */ + unsigned int corkflag; /* Cork is required */ + __u16 encap_type; /* Is this an Encapsulation socket? */ + /* + * The following members retain the information to create a UDP header + * when the socket is uncorked. + */ + u32 saddr; /* source address */ + u32 daddr; /* destination address */ + __u16 sport; /* source port */ + __u16 dport; /* destination port */ + __u16 len; /* total length of pending frames */ +}; /* * This structure really needs to be cleaned up.
@@ -542,6 +588,7 @@ wait_queue_head_t *sleep; /* Sock wait queue */ struct dst_entry *dst_cache; /* Destination cache */ rwlock_t dst_lock; + struct xfrm_policy *policy[2]; atomic_t rmem_alloc; /* Receive queue bytes committed */ struct sk_buff_head receive_queue; /* Incoming packets */ atomic_t wmem_alloc; /* Transmit queue bytes committed */ @@ -598,10 +645,12 @@ union { struct ipv6_pinfo af_inet6; } net_pinfo; +#define inet6_sk(sk) (&(sk)->net_pinfo.af_inet6) #endif union { struct tcp_opt af_tcp; + struct udp_opt af_udp; #if defined(CONFIG_IP_SCTP) || defined (CONFIG_IP_SCTP_MODULE) struct sctp_opt af_sctp; #endif @@ -616,6 +665,10 @@ #endif /* CONFIG_SPX */ } tp_pinfo; +#define tcp_sk(sk) (&(sk)->tp_pinfo.af_tcp) +#define udp_sk(sk) (&(sk)->tp_pinfo.af_udp) +#define raw_sk(sk) (&(sk)->tp_pinfo.tp_raw4) +#define raw6_sk(sk) (&(sk)->tp_pinfo.tp_raw) int err, err_soft; /* Soft holds errors that don't cause failure but are the cause @@ -686,8 +739,11 @@ #if defined(CONFIG_WAN_ROUTER) || defined(CONFIG_WAN_ROUTER_MODULE) struct wanpipe_opt *af_wanpipe; #endif +#if defined(CONFIG_NET_KEY) || defined(CONFIG_NET_KEY_MODULE) + struct pfkey_opt *pf_key; +#endif } protinfo; - +#define inet_sk(sk) (&(sk)->protinfo.af_inet) /* This part is used for the timeout functions. */ struct timer_list timer; /* This is the sock cleanup timer. 
*/ @@ -751,6 +807,8 @@ int (*recvmsg)(struct sock *sk, struct msghdr *msg, int len, int noblock, int flags, int *addr_len); + int (*sendpage)(struct sock *sk, struct page *page, + int offset, size_t size, int flags); int (*bind)(struct sock *sk, struct sockaddr *uaddr, int addr_len); Index: include/net/tcp.h =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/include/net/tcp.h,v retrieving revision 1.1.1.24 retrieving revision 1.1.1.24.2.1 diff -u -r1.1.1.24 -r1.1.1.24.2.1 --- a/include/net/tcp.h 14 Apr 2004 13:05:40 -0000 1.1.1.24 +++ b/include/net/tcp.h 16 Apr 2004 13:16:19 -0000 1.1.1.24.2.1 @@ -546,13 +546,6 @@ /* * Pointers to address related TCP functions * (i.e. things that depend on the address family) - * - * BUGGG_FUTURE: all the idea behind this struct is wrong. - * It mixes socket frontend with transport function. - * With port sharing between IPv6/v4 it gives the only advantage, - * only poor IPv6 needs to permanently recheck, that it - * is still IPv6 8)8) It must be cleaned up as soon as possible. - * --ANK (980802) */ struct tcp_func { @@ -909,9 +902,12 @@ struct dst_entry *dst = __sk_dst_get(sk); int mss_now = tp->mss_cache; - if (dst && dst->pmtu != tp->pmtu_cookie) - mss_now = tcp_sync_mss(sk, dst->pmtu); - + if (dst) { + u32 mtu = dst_pmtu(dst); + if (mtu != tp->pmtu_cookie || + tp->ext2_header_len != dst->header_len) + mss_now = tcp_sync_mss(sk, mtu); + } if (tp->eff_sacks) mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK)); @@ -1152,7 +1148,7 @@ } } -extern __u32 tcp_init_cwnd(struct tcp_opt *tp); +extern __u32 tcp_init_cwnd(struct tcp_opt *tp, struct dst_entry *dst); /* Slow start with delack produces 3 packets of burst, so that * it is safe "de facto". 
Index: include/net/transp_v6.h =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/include/net/transp_v6.h,v retrieving revision 1.1.1.14 retrieving revision 1.1.1.14.2.1 diff -u -r1.1.1.14 -r1.1.1.14.2.1 --- a/include/net/transp_v6.h 13 Jun 2003 14:51:39 -0000 1.1.1.14 +++ b/include/net/transp_v6.h 16 Apr 2004 13:16:19 -0000 1.1.1.14.2.1 @@ -17,6 +17,13 @@ extern void ipv6_frag_init(void); +/* extension headers */ +extern void ipv6_rthdr_init(void); +extern void ipv6_frag_init(void); +extern void ipv6_nodata_init(void); +extern void ipv6_destopt_init(void); + +/* transport protocols */ extern void rawv6_init(void); extern void udpv6_init(void); extern void tcpv6_init(void); Index: include/net/xfrm.h =================================================================== RCS file: include/net/xfrm.h diff -N include/net/xfrm.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ b/include/net/xfrm.h 16 Apr 2004 13:16:19 -0000 1.9.8.1 @@ -0,0 +1,893 @@ +#ifndef _NET_XFRM_H +#define _NET_XFRM_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#define XFRM_ALIGN8(len) (((len) + 7) & ~7) + +extern struct semaphore xfrm_cfg_sem; + +/* Organization of SPD aka "XFRM rules" + ------------------------------------ + + Basic objects: + - policy rule, struct xfrm_policy (=SPD entry) + - bundle of transformations, struct dst_entry == struct xfrm_dst (=SA bundle) + - instance of a transformer, struct xfrm_state (=SA) + - template to clone xfrm_state, struct xfrm_tmpl + + SPD is plain linear list of xfrm_policy rules, ordered by priority. + (To be compatible with existing pfkeyv2 implementations, + many rules with priority of 0x7fffffff are allowed to exist and + such rules are ordered in an unpredictable way, thanks to bsd folks.) + + Lookup is plain linear search until the first match with selector.
+ + If "action" is "block", then we prohibit the flow, otherwise: + if "xfrms_nr" is zero, the flow passes untransformed. Otherwise, + policy entry has list of up to XFRM_MAX_DEPTH transformations, + described by templates xfrm_tmpl. Each template is resolved + to a complete xfrm_state (see below) and we pack bundle of transformations + to a dst_entry returned to requestor. + + dst -. xfrm .-> xfrm_state #1 + |---. child .-> dst -. xfrm .-> xfrm_state #2 + |---. child .-> dst -. xfrm .-> xfrm_state #3 + |---. child .-> NULL + + Bundles are cached at xfrm_policy struct (field ->bundles). + + + Resolution of xfrm_tmpl + ----------------------- + Template contains: + 1. ->mode Mode: transport or tunnel + 2. ->id.proto Protocol: AH/ESP/IPCOMP + 3. ->id.daddr Remote tunnel endpoint, ignored for transport mode. + Q: allow to resolve security gateway? + 4. ->id.spi If not zero, static SPI. + 5. ->saddr Local tunnel endpoint, ignored for transport mode. + 6. ->algos List of allowed algos. Plain bitmask now. + Q: ealgos, aalgos, calgos. What a mess... + 7. ->share Sharing mode. + Q: how to implement private sharing mode? To add struct sock* to + flow id? + + Having this template we search through SAD searching for entries + with appropriate mode/proto/algo, permitted by selector. + If no appropriate entry found, it is requested from key manager. + + PROBLEMS: + Q: How to find all the bundles referring to a physical path for + PMTU discovery? Seems, dst should contain list of all parents... + and enter to infinite locking hierarchy disaster. + No! It is easier, we will not search for them, let them find us. + We add genid to each dst plus pointer to genid of raw IP route, + pmtu disc will update pmtu on raw IP route and increase its genid. + dst_check() will see this for top level and trigger resyncing + metrics. Plus, it will be made via sk->dst_cache. Solved. + */ + +/* Full description of state of transformer.
*/ +struct xfrm_state +{ + /* Note: bydst is re-used during gc */ + struct list_head bydst; + struct list_head byspi; + + atomic_t refcnt; + spinlock_t lock; + + struct xfrm_id id; + struct xfrm_selector sel; + + /* Key manager bits */ + struct { + u8 state; + u8 dying; + u32 seq; + } km; + + /* Parameters of this state. */ + struct { + u32 reqid; + u8 mode; + u8 replay_window; + u8 aalgo, ealgo, calgo; + u8 flags; + u16 family; + xfrm_address_t saddr; + int header_len; + int trailer_len; + } props; + + struct xfrm_lifetime_cfg lft; + + /* Data for transformer */ + struct xfrm_algo *aalg; + struct xfrm_algo *ealg; + struct xfrm_algo *calg; + + /* Data for encapsulator */ + struct xfrm_encap_tmpl *encap; + + /* IPComp needs an IPIP tunnel for handling uncompressed packets */ + struct xfrm_state *tunnel; + + /* If a tunnel, number of users + 1 */ + atomic_t tunnel_users; + + /* State for replay detection */ + struct xfrm_replay_state replay; + + /* Statistics */ + struct xfrm_stats stats; + + struct xfrm_lifetime_cur curlft; + struct timer_list timer; + + /* Reference to data common to all the instances of this + * transformer. */ + struct xfrm_type *type; + + /* Private data of this transformer, format is opaque, + * interpreted by xfrm_type methods.
*/ + void *data; +}; + +enum { + XFRM_STATE_VOID, + XFRM_STATE_ACQ, + XFRM_STATE_VALID, + XFRM_STATE_ERROR, + XFRM_STATE_EXPIRED, + XFRM_STATE_DEAD +}; + +struct xfrm_type; +struct xfrm_dst; +struct xfrm_policy_afinfo { + unsigned short family; + rwlock_t lock; + struct xfrm_type_map *type_map; + struct dst_ops *dst_ops; + void (*garbage_collect)(void); + int (*dst_lookup)(struct xfrm_dst **dst, struct flowi *fl); + struct dst_entry *(*find_bundle)(struct flowi *fl, struct xfrm_policy *policy); + int (*bundle_create)(struct xfrm_policy *policy, + struct xfrm_state **xfrm, + int nx, + struct flowi *fl, + struct dst_entry **dst_p); + void (*decode_session)(struct sk_buff *skb, + struct flowi *fl); +}; + +extern int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo); +extern int xfrm_policy_unregister_afinfo(struct xfrm_policy_afinfo *afinfo); +extern struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family); +extern void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo); + +#define XFRM_ACQ_EXPIRES 30 + +struct xfrm_tmpl; +struct xfrm_state_afinfo { + unsigned short family; + rwlock_t lock; + struct list_head *state_bydst; + struct list_head *state_byspi; + void (*init_tempsel)(struct xfrm_state *x, struct flowi *fl, + struct xfrm_tmpl *tmpl, + xfrm_address_t *daddr, xfrm_address_t *saddr); + struct xfrm_state *(*state_lookup)(xfrm_address_t *daddr, u32 spi, u8 proto); + struct xfrm_state *(*find_acq)(u8 mode, u32 reqid, u8 proto, + xfrm_address_t *daddr, xfrm_address_t *saddr, + int create); +}; + +extern int xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo); +extern int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo); +extern struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned short family); +extern void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo); + +extern void xfrm_state_delete_tunnel(struct xfrm_state *x); + +struct xfrm_decap_state; +struct xfrm_type +{ + char *description; + struct 
module *owner; + __u8 proto; + + int (*init_state)(struct xfrm_state *x, void *args); + void (*destructor)(struct xfrm_state *); + int (*input)(struct xfrm_state *, struct xfrm_decap_state *, struct sk_buff *skb); + int (*post_input)(struct xfrm_state *, struct xfrm_decap_state *, struct sk_buff *skb); + int (*output)(struct sk_buff *skb); + /* Estimate maximal size of result of transformation of a dgram */ + u32 (*get_max_size)(struct xfrm_state *, int size); +}; + +struct xfrm_type_map { + rwlock_t lock; + struct xfrm_type *map[256]; +}; + +extern int xfrm_register_type(struct xfrm_type *type, unsigned short family); +extern int xfrm_unregister_type(struct xfrm_type *type, unsigned short family); +extern struct xfrm_type *xfrm_get_type(u8 proto, unsigned short family); +extern void xfrm_put_type(struct xfrm_type *type); + +struct xfrm_tmpl +{ +/* id in template is interpreted as: + * daddr - destination of tunnel, may be zero for transport mode. + * spi - zero to acquire spi. Not zero if spi is static, then + * daddr must be fixed too. + * proto - AH/ESP/IPCOMP + */ + struct xfrm_id id; + +/* Source address of tunnel. Ignored, if it is not a tunnel. */ + xfrm_address_t saddr; + + __u32 reqid; + +/* Mode: transport/tunnel */ + __u8 mode; + +/* Sharing mode: unique, this session only, this user only etc. */ + __u8 share; + +/* May skip this transformation if no SA is found */ + __u8 optional; + +/* Bit mask of algos allowed for acquisition */ + __u32 aalgos; + __u32 ealgos; + __u32 calgos; +}; + +#define XFRM_MAX_DEPTH 4 + +struct xfrm_policy +{ + struct xfrm_policy *next; + struct list_head list; + + /* This lock only affects elements except for entry.
*/ + rwlock_t lock; + atomic_t refcnt; + struct timer_list timer; + + u32 priority; + u32 index; + struct xfrm_selector selector; + struct xfrm_lifetime_cfg lft; + struct xfrm_lifetime_cur curlft; + struct dst_entry *bundles; + __u16 family; + __u8 action; + __u8 flags; + __u8 dead; + __u8 xfrm_nr; + struct xfrm_tmpl xfrm_vec[XFRM_MAX_DEPTH]; +}; + +#define XFRM_KM_TIMEOUT 30 + +struct xfrm_mgr +{ + struct list_head list; + char *id; + int (*notify)(struct xfrm_state *x, int event); + int (*acquire)(struct xfrm_state *x, struct xfrm_tmpl *, struct xfrm_policy *xp, int dir); + struct xfrm_policy *(*compile_policy)(u16 family, int opt, u8 *data, int len, int *dir); + int (*new_mapping)(struct xfrm_state *x, xfrm_address_t *ipaddr, u16 sport); + int (*notify_policy)(struct xfrm_policy *x, int dir, int event); +}; + +extern int xfrm_register_km(struct xfrm_mgr *km); +extern int xfrm_unregister_km(struct xfrm_mgr *km); + + +#define XFRM_FLOWCACHE_HASH_SIZE 1024 + +static inline u32 __flow_hash4(struct flowi *fl) +{ + u32 hash = fl->fl4_src ^ fl->fl_ip_sport; + + hash = ((hash & 0xF0F0F0F0) >> 4) | ((hash & 0x0F0F0F0F) << 4); + + hash ^= fl->fl4_dst ^ fl->fl_ip_dport; + hash ^= (hash >> 10); + hash ^= (hash >> 20); + return hash & (XFRM_FLOWCACHE_HASH_SIZE-1); +} + +static inline u32 __flow_hash6(struct flowi *fl) +{ + u32 hash = fl->fl6_src.s6_addr32[2] ^ + fl->fl6_src.s6_addr32[3] ^ + fl->fl_ip_sport; + + hash = ((hash & 0xF0F0F0F0) >> 4) | ((hash & 0x0F0F0F0F) << 4); + + hash ^= fl->fl6_dst.s6_addr32[2] ^ + fl->fl6_dst.s6_addr32[3] ^ + fl->fl_ip_dport; + hash ^= (hash >> 10); + hash ^= (hash >> 20); + return hash & (XFRM_FLOWCACHE_HASH_SIZE-1); +} + +static inline u32 flow_hash(struct flowi *fl, unsigned short family) +{ + switch (family) { + case AF_INET: + return __flow_hash4(fl); + case AF_INET6: + return __flow_hash6(fl); + } + return 0; /*XXX*/ +} + +extern struct xfrm_policy *xfrm_policy_list[XFRM_POLICY_MAX*2]; + +static inline void xfrm_pol_hold(struct 
xfrm_policy *policy) +{ + if (likely(policy != NULL)) + atomic_inc(&policy->refcnt); +} + +extern void __xfrm_policy_destroy(struct xfrm_policy *policy); + +static inline void xfrm_pol_put(struct xfrm_policy *policy) +{ + if (atomic_dec_and_test(&policy->refcnt)) + __xfrm_policy_destroy(policy); +} + +#define XFRM_DST_HSIZE 1024 + +static __inline__ +unsigned __xfrm4_dst_hash(xfrm_address_t *addr) +{ + unsigned h; + h = ntohl(addr->a4); + h = (h ^ (h>>16)) % XFRM_DST_HSIZE; + return h; +} + +static __inline__ +unsigned __xfrm6_dst_hash(xfrm_address_t *addr) +{ + unsigned h; + h = ntohl(addr->a6[2]^addr->a6[3]); + h = (h ^ (h>>16)) % XFRM_DST_HSIZE; + return h; +} + +static __inline__ +unsigned xfrm_dst_hash(xfrm_address_t *addr, unsigned short family) +{ + switch (family) { + case AF_INET: + return __xfrm4_dst_hash(addr); + case AF_INET6: + return __xfrm6_dst_hash(addr); + } + return 0; +} + +static __inline__ +unsigned __xfrm4_spi_hash(xfrm_address_t *addr, u32 spi, u8 proto) +{ + unsigned h; + h = ntohl(addr->a4^spi^proto); + h = (h ^ (h>>10) ^ (h>>20)) % XFRM_DST_HSIZE; + return h; +} + +static __inline__ +unsigned __xfrm6_spi_hash(xfrm_address_t *addr, u32 spi, u8 proto) +{ + unsigned h; + h = ntohl(addr->a6[2]^addr->a6[3]^spi^proto); + h = (h ^ (h>>10) ^ (h>>20)) % XFRM_DST_HSIZE; + return h; +} + +static __inline__ +unsigned xfrm_spi_hash(xfrm_address_t *addr, u32 spi, u8 proto, unsigned short family) +{ + switch (family) { + case AF_INET: + return __xfrm4_spi_hash(addr, spi, proto); + case AF_INET6: + return __xfrm6_spi_hash(addr, spi, proto); + } + return 0; /*XXX*/ +} + +extern void __xfrm_state_destroy(struct xfrm_state *); + +static inline void xfrm_state_put(struct xfrm_state *x) +{ + if (atomic_dec_and_test(&x->refcnt)) + __xfrm_state_destroy(x); +} + +static inline void xfrm_state_hold(struct xfrm_state *x) +{ + atomic_inc(&x->refcnt); +} + +static __inline__ int addr_match(void *token1, void *token2, int prefixlen) +{ + __u32 *a1 = token1; + __u32 
*a2 = token2; + int pdw; + int pbi; + + pdw = prefixlen >> 5; /* num of whole __u32 in prefix */ + pbi = prefixlen & 0x1f; /* num of bits in incomplete u32 in prefix */ + + if (pdw) + if (memcmp(a1, a2, pdw << 2)) + return 0; + + if (pbi) { + __u32 mask; + + mask = htonl((0xffffffff) << (32 - pbi)); + + if ((a1[pdw] ^ a2[pdw]) & mask) + return 0; + } + + return 1; +} + +static inline int +__xfrm4_selector_match(struct xfrm_selector *sel, struct flowi *fl) +{ + return addr_match(&fl->fl4_dst, &sel->daddr, sel->prefixlen_d) && + addr_match(&fl->fl4_src, &sel->saddr, sel->prefixlen_s) && + !((fl->fl_ip_dport^sel->dport)&sel->dport_mask) && + !((fl->fl_ip_sport^sel->sport)&sel->sport_mask) && + (fl->proto == sel->proto || !sel->proto) && + (fl->oif == sel->ifindex || !sel->ifindex); +} + +static inline int +__xfrm6_selector_match(struct xfrm_selector *sel, struct flowi *fl) +{ + return addr_match(&fl->fl6_dst, &sel->daddr, sel->prefixlen_d) && + addr_match(&fl->fl6_src, &sel->saddr, sel->prefixlen_s) && + !((fl->fl_ip_dport^sel->dport)&sel->dport_mask) && + !((fl->fl_ip_sport^sel->sport)&sel->sport_mask) && + (fl->proto == sel->proto || !sel->proto) && + (fl->oif == sel->ifindex || !sel->ifindex); +} + +static inline int +xfrm_selector_match(struct xfrm_selector *sel, struct flowi *fl, + unsigned short family) +{ + switch (family) { + case AF_INET: + return __xfrm4_selector_match(sel, fl); + case AF_INET6: + return __xfrm6_selector_match(sel, fl); + } + return 0; +} + +/* placeholder until xfrm6_tunnel.c is written */ +static inline int xfrm6_tunnel_check_size(struct sk_buff *skb) +{ return 0; } + +/* A struct encoding bundle of transformations to apply to some set of flow. + * + * dst->child points to the next element of bundle. + * dst->xfrm points to an instance of transformer. + * + * Due to unfortunate limitations of current routing cache, which we + * have no time to fix, it mirrors struct rtable and bound to the same + * routing key, including saddr,daddr.
However, we can have many of + * bundles differing by session id. All the bundles grow from a parent + * policy rule. + */ +struct xfrm_dst +{ + union { + struct xfrm_dst *next; + struct dst_entry dst; + struct rtable rt; + struct rt6_info rt6; + } u; +}; + +/* Decapsulation state, used by the input to store data during + * decapsulation procedure, to be used later (during the policy + * check) + */ +struct xfrm_decap_state { + char decap_data[20]; + __u16 decap_type; +}; + +struct sec_decap_state { + struct xfrm_state *xvec; + struct xfrm_decap_state decap; +}; + +struct sec_path +{ + atomic_t refcnt; + int len; + struct sec_decap_state x[XFRM_MAX_DEPTH]; +}; + +static inline struct sec_path * +secpath_get(struct sec_path *sp) +{ + if (sp) + atomic_inc(&sp->refcnt); + return sp; +} + +extern void __secpath_destroy(struct sec_path *sp); + +static inline void +secpath_put(struct sec_path *sp) +{ + if (sp && atomic_dec_and_test(&sp->refcnt)) + __secpath_destroy(sp); +} + +extern struct sec_path *secpath_dup(struct sec_path *src); + +static inline void +secpath_reset(struct sk_buff *skb) +{ +#ifdef CONFIG_XFRM + secpath_put(skb->sp); + skb->sp = NULL; +#endif +} + +static inline int +__xfrm4_state_addr_cmp(struct xfrm_tmpl *tmpl, struct xfrm_state *x) +{ + return (tmpl->saddr.a4 && + tmpl->saddr.a4 != x->props.saddr.a4); +} + +static inline int +__xfrm6_state_addr_cmp(struct xfrm_tmpl *tmpl, struct xfrm_state *x) +{ + return (!ipv6_addr_any((struct in6_addr*)&tmpl->saddr) && + ipv6_addr_cmp((struct in6_addr *)&tmpl->saddr, (struct in6_addr*)&x->props.saddr)); +} + +static inline int +xfrm_state_addr_cmp(struct xfrm_tmpl *tmpl, struct xfrm_state *x, unsigned short family) +{ + switch (family) { + case AF_INET: + return __xfrm4_state_addr_cmp(tmpl, x); + case AF_INET6: + return __xfrm6_state_addr_cmp(tmpl, x); + } + return !0; +} + +#ifdef CONFIG_XFRM + +extern int __xfrm_policy_check(struct sock *, int dir, struct sk_buff *skb, unsigned short family); + +static inline
int xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, unsigned short family) +{ + if (sk && sk->policy[XFRM_POLICY_IN]) + return __xfrm_policy_check(sk, dir, skb, family); + + return !xfrm_policy_list[dir] || + (skb->dst->flags & DST_NOPOLICY) || + __xfrm_policy_check(sk, dir, skb, family); +} + +static inline int xfrm4_policy_check(struct sock *sk, int dir, struct sk_buff *skb) +{ + return xfrm_policy_check(sk, dir, skb, AF_INET); +} + +static inline int xfrm6_policy_check(struct sock *sk, int dir, struct sk_buff *skb) +{ + return xfrm_policy_check(sk, dir, skb, AF_INET6); +} + + +extern int __xfrm_route_forward(struct sk_buff *skb, unsigned short family); + +static inline int xfrm_route_forward(struct sk_buff *skb, unsigned short family) +{ + return !xfrm_policy_list[XFRM_POLICY_OUT] || + (skb->dst->flags & DST_NOXFRM) || + __xfrm_route_forward(skb, family); +} + +static inline int xfrm4_route_forward(struct sk_buff *skb) +{ + return xfrm_route_forward(skb, AF_INET); +} + +static inline int xfrm6_route_forward(struct sk_buff *skb) +{ + return xfrm_route_forward(skb, AF_INET6); +} + +extern int __xfrm_sk_clone_policy(struct sock *sk); + +static inline int xfrm_sk_clone_policy(struct sock *sk) +{ + if (unlikely(sk->policy[0] || sk->policy[1])) + return __xfrm_sk_clone_policy(sk); + return 0; +} + +extern void xfrm_policy_delete(struct xfrm_policy *pol, int dir); + +static inline void xfrm_sk_free_policy(struct sock *sk) +{ + if (unlikely(sk->policy[0] != NULL)) { + xfrm_policy_delete(sk->policy[0], XFRM_POLICY_MAX); + sk->policy[0] = NULL; + } + if (unlikely(sk->policy[1] != NULL)) { + xfrm_policy_delete(sk->policy[1], XFRM_POLICY_MAX+1); + sk->policy[1] = NULL; + } +} + +#else + +static inline void xfrm_sk_free_policy(struct sock *sk) {} +static inline int xfrm_sk_clone_policy(struct sock *sk) { return 0; } +static inline int xfrm6_route_forward(struct sk_buff *skb) { return 1; } +static inline int xfrm4_route_forward(struct sk_buff *skb) { return 
1; } +static inline int xfrm6_policy_check(struct sock *sk, int dir, struct sk_buff *skb) +{ + return 1; +} +static inline int xfrm4_policy_check(struct sock *sk, int dir, struct sk_buff *skb) +{ + return 1; +} +static inline int xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, unsigned short family) +{ + return 1; +} +#endif + +static __inline__ +xfrm_address_t *xfrm_flowi_daddr(struct flowi *fl, unsigned short family) +{ + switch (family){ + case AF_INET: + return (xfrm_address_t *)&fl->fl4_dst; + case AF_INET6: + return (xfrm_address_t *)&fl->fl6_dst; + } + return NULL; +} + +static __inline__ +xfrm_address_t *xfrm_flowi_saddr(struct flowi *fl, unsigned short family) +{ + switch (family){ + case AF_INET: + return (xfrm_address_t *)&fl->fl4_src; + case AF_INET6: + return (xfrm_address_t *)&fl->fl6_src; + } + return NULL; +} + +static __inline__ int +__xfrm4_state_addr_check(struct xfrm_state *x, + xfrm_address_t *daddr, xfrm_address_t *saddr) +{ + if (daddr->a4 == x->id.daddr.a4 && + (saddr->a4 == x->props.saddr.a4 || !saddr->a4 || !x->props.saddr.a4)) + return 1; + return 0; +} + +static __inline__ int +__xfrm6_state_addr_check(struct xfrm_state *x, + xfrm_address_t *daddr, xfrm_address_t *saddr) +{ + if (!ipv6_addr_cmp((struct in6_addr *)daddr, (struct in6_addr *)&x->id.daddr) && + (!ipv6_addr_cmp((struct in6_addr *)saddr, (struct in6_addr *)&x->props.saddr)|| + ipv6_addr_any((struct in6_addr *)saddr) || + ipv6_addr_any((struct in6_addr *)&x->props.saddr))) + return 1; + return 0; +} + +static __inline__ int +xfrm_state_addr_check(struct xfrm_state *x, + xfrm_address_t *daddr, xfrm_address_t *saddr, + unsigned short family) +{ + switch (family) { + case AF_INET: + return __xfrm4_state_addr_check(x, daddr, saddr); + case AF_INET6: + return __xfrm6_state_addr_check(x, daddr, saddr); + } + return 0; +} + +static inline int xfrm_state_kern(struct xfrm_state *x) +{ + return atomic_read(&x->tunnel_users); +} + +/* + * xfrm algorithm information + */ 
+struct xfrm_algo_auth_info { + u16 icv_truncbits; + u16 icv_fullbits; +}; + +struct xfrm_algo_encr_info { + u16 blockbits; + u16 defkeybits; +}; + +struct xfrm_algo_comp_info { + u16 threshold; +}; + +struct xfrm_algo_desc { + char *name; + u8 available:1; + union { + struct xfrm_algo_auth_info auth; + struct xfrm_algo_encr_info encr; + struct xfrm_algo_comp_info comp; + } uinfo; + struct sadb_alg desc; +}; + +/* XFRM tunnel handlers. */ +struct xfrm_tunnel { + int (*handler)(struct sk_buff *skb); + void (*err_handler)(struct sk_buff *skb, void *info); +}; + +extern void xfrm_init(void); +extern void xfrm4_init(void); +extern void xfrm4_fini(void); +extern void xfrm6_init(void); +extern void xfrm6_fini(void); +extern void xfrm_state_init(void); +extern void xfrm4_state_init(void); +extern void xfrm4_state_fini(void); +extern void xfrm6_state_init(void); +extern void xfrm6_state_fini(void); + +extern int xfrm_state_walk(u8 proto, int (*func)(struct xfrm_state *, int, void*), void *); +extern struct xfrm_state *xfrm_state_alloc(void); +extern struct xfrm_state *xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr, + struct flowi *fl, struct xfrm_tmpl *tmpl, + struct xfrm_policy *pol, int *err, + unsigned short family); +extern int xfrm_state_check_expire(struct xfrm_state *x); +extern void xfrm_state_insert(struct xfrm_state *x); +extern int xfrm_state_add(struct xfrm_state *x); +extern int xfrm_state_update(struct xfrm_state *x); +extern int xfrm_state_check_space(struct xfrm_state *x, struct sk_buff *skb); +extern struct xfrm_state *xfrm_state_lookup(xfrm_address_t *daddr, u32 spi, u8 proto, unsigned short family); +extern struct xfrm_state *xfrm_find_acq_byseq(u32 seq); +extern void xfrm_state_delete(struct xfrm_state *x); +extern void xfrm_state_flush(u8 proto); +extern int xfrm_replay_check(struct xfrm_state *x, u32 seq); +extern void xfrm_replay_advance(struct xfrm_state *x, u32 seq); +extern int xfrm_check_selectors(struct xfrm_state **x, int n, 
struct flowi *fl); +extern int xfrm_check_output(struct xfrm_state *x, struct sk_buff *skb, unsigned short family); +extern int xfrm4_rcv(struct sk_buff *skb); +extern int xfrm4_tunnel_register(struct xfrm_tunnel *handler); +extern int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler); +extern int xfrm4_tunnel_check_size(struct sk_buff *skb); +extern int xfrm6_rcv(struct sk_buff **pskb, unsigned int *nhoffp); + +#ifdef CONFIG_XFRM +extern int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type); +extern int xfrm_user_policy(struct sock *sk, int optname, u8 *optval, int optlen); +extern int xfrm_dst_lookup(struct xfrm_dst **dst, struct flowi *fl, unsigned short family); +#else +static inline int xfrm_user_policy(struct sock *sk, int optname, u8 *optval, int optlen) +{ + return -ENOPROTOOPT; +} + +static inline int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type) +{ + /* should not happen */ + kfree_skb(skb); + return 0; +} +static inline int xfrm_dst_lookup(struct xfrm_dst **dst, struct flowi *fl, unsigned short family) +{ + return -EINVAL; +} +#endif + +void xfrm_policy_init(void); +void xfrm4_policy_init(void); +void xfrm6_policy_init(void); +struct xfrm_policy *xfrm_policy_alloc(int gfp); +extern int xfrm_policy_walk(int (*func)(struct xfrm_policy *, int, int, void*), void *); +int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl); +struct xfrm_policy *xfrm_policy_bysel(int dir, struct xfrm_selector *sel, + int delete); +struct xfrm_policy *xfrm_policy_byid(int dir, u32 id, int delete); +void xfrm_policy_flush(void); +u32 xfrm_get_acqseq(void); +void xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi); +struct xfrm_state * xfrm_find_acq(u8 mode, u32 reqid, u8 proto, + xfrm_address_t *daddr, xfrm_address_t *saddr, + int create, unsigned short family); +extern void xfrm_policy_flush(void); +extern void xfrm_policy_kill(struct xfrm_policy *); +extern int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol); 
+extern struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, struct flowi *fl); +extern int xfrm_flush_bundles(void); + +extern wait_queue_head_t km_waitq; +extern void km_state_expired(struct xfrm_state *x, int hard); +extern int km_query(struct xfrm_state *x, struct xfrm_tmpl *, struct xfrm_policy *pol); +extern int km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, u16 sport); +extern void km_policy_expired(struct xfrm_policy *pol, int dir, int hard); + +extern void xfrm_input_init(void); +extern int xfrm_parse_spi(struct sk_buff *skb, u8 nexthdr, u32 *spi, u32 *seq); + +extern void xfrm_probe_algs(void); +extern int xfrm_count_auth_supported(void); +extern int xfrm_count_enc_supported(void); +extern struct xfrm_algo_desc *xfrm_aalg_get_byidx(unsigned int idx); +extern struct xfrm_algo_desc *xfrm_ealg_get_byidx(unsigned int idx); +extern struct xfrm_algo_desc *xfrm_calg_get_byidx(unsigned int idx); +extern struct xfrm_algo_desc *xfrm_aalg_get_byid(int alg_id); +extern struct xfrm_algo_desc *xfrm_ealg_get_byid(int alg_id); +extern struct xfrm_algo_desc *xfrm_calg_get_byid(int alg_id); +extern struct xfrm_algo_desc *xfrm_aalg_get_byname(char *name); +extern struct xfrm_algo_desc *xfrm_ealg_get_byname(char *name); +extern struct xfrm_algo_desc *xfrm_calg_get_byname(char *name); + +struct crypto_tfm; +typedef void (icv_update_fn_t)(struct crypto_tfm *, struct scatterlist *, unsigned int); + +extern void skb_icv_walk(const struct sk_buff *skb, struct crypto_tfm *tfm, + int offset, int len, icv_update_fn_t icv_update); + +#endif /* _NET_XFRM_H */ Index: include/net/sctp/compat.h =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/include/net/sctp/compat.h,v retrieving revision 1.1.1.7 retrieving revision 1.1.1.7.2.1 diff -u -r1.1.1.7 -r1.1.1.7.2.1 --- a/include/net/sctp/compat.h 14 Apr 2004 13:05:40 -0000 1.1.1.7 +++ b/include/net/sctp/compat.h 16 Apr 2004 
13:16:19 -0000 1.1.1.7.2.1 @@ -55,14 +55,10 @@ extern type name[] #define SNMP_DEC_STATS(mib, field) ((mib)[2*smp_processor_id()+!in_softirq()].field--) -#define inet_sk(__sk) (&(((struct sock *)__sk)->protinfo.af_inet)) -#define inet6_sk(__sk) (&(((struct sock *)__sk)->net_pinfo.af_inet6)) - #define virt_addr_valid(x) VALID_PAGE(virt_to_page((x))) #define sock_owned_by_user(sk) ((sk)->lock.users) #define sk_set_owner(x, y) #define __unsafe(x) MOD_INC_USE_COUNT -#define dst_pmtu(x) ((x)->pmtu) #define sk_family family #define sk_state state Index: net/Config.in =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/Config.in,v retrieving revision 1.1.1.22 retrieving revision 1.1.1.22.2.1 diff -u -r1.1.1.22 -r1.1.1.22.2.1 --- a/net/Config.in 28 Nov 2003 18:26:21 -0000 1.1.1.22 +++ b/net/Config.in 16 Apr 2004 13:16:19 -0000 1.1.1.22.2.1 @@ -16,6 +16,7 @@ fi bool 'Socket Filtering' CONFIG_FILTER tristate 'Unix domain sockets' CONFIG_UNIX +tristate 'PF_KEY sockets' CONFIG_NET_KEY bool 'TCP/IP networking' CONFIG_INET if [ "$CONFIG_INET" = "y" ]; then source net/ipv4/Config.in @@ -26,6 +27,29 @@ source net/ipv6/Config.in fi fi + if [ "$CONFIG_NET_KEY" != "n" -o \ + "$CONFIG_NET_IPIP" != "n" -o \ + "$CONFIG_NET_IPGRE" != "n" -o \ + "$CONFIG_INET_AH" != "n" -o \ + "$CONFIG_INET_ESP" != "n" -o \ + "$CONFIG_INET_IPCOMP" != "n" ]; then + define_bool CONFIG_XFRM y + else + if [ "$CONFIG_IPV6" != "n" ]; then + if [ "$CONFIG_INET6_AH" != "n" -o \ + "$CONFIG_INET6_ESP" != "n" -o \ + "$CONFIG_INET6_IPCOMP" != "n" ]; then + define_bool CONFIG_XFRM y + else + bool ' XFRM support' CONFIG_XFRM + fi + else + bool ' XFRM support' CONFIG_XFRM + fi + fi + if [ "$CONFIG_XFRM" = "y" ]; then + source net/xfrm/Config.in + fi if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then source net/khttpd/Config.in fi Index: net/Makefile =================================================================== RCS file: 
/home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/Makefile,v retrieving revision 1.1.1.22 retrieving revision 1.1.1.22.2.1 diff -u -r1.1.1.22 -r1.1.1.22.2.1 --- a/net/Makefile 14 Apr 2004 13:05:41 -0000 1.1.1.22 +++ b/net/Makefile 16 Apr 2004 13:16:19 -0000 1.1.1.22.2.1 @@ -7,28 +7,23 @@ O_TARGET := network.o -mod-subdirs := ipv4/netfilter ipv6/netfilter ipx irda bluetooth atm netlink sched core sctp +mod-subdirs := ipv4/netfilter ipv6 ipx irda bluetooth atm netlink sched core sctp xfrm export-objs := netsyms.o subdir-y := core ethernet -subdir-m := ipv4 # hum? +subdir-m := ipv4 xfrm # hum? subdir-$(CONFIG_NET) += 802 sched netlink subdir-$(CONFIG_IPV6) += ipv6 -subdir-$(CONFIG_INET) += ipv4 +subdir-$(CONFIG_INET) += ipv4 xfrm subdir-$(CONFIG_NETFILTER) += ipv4/netfilter subdir-$(CONFIG_UNIX) += unix subdir-$(CONFIG_IP_SCTP) += sctp -ifneq ($(CONFIG_IPV6),n) -ifneq ($(CONFIG_IPV6),) -subdir-$(CONFIG_NETFILTER) += ipv6/netfilter -endif -endif - subdir-$(CONFIG_KHTTPD) += khttpd subdir-$(CONFIG_PACKET) += packet +subdir-$(CONFIG_NET_KEY) += key subdir-$(CONFIG_NET_SCHED) += sched subdir-$(CONFIG_BRIDGE) += bridge subdir-$(CONFIG_IPX) += ipx Index: net/netsyms.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/netsyms.c,v retrieving revision 1.1.1.32 retrieving revision 1.1.1.32.2.1 diff -u -r1.1.1.32 -r1.1.1.32.2.1 --- a/net/netsyms.c 14 Apr 2004 13:05:41 -0000 1.1.1.32 +++ b/net/netsyms.c 16 Apr 2004 13:16:19 -0000 1.1.1.32.2.1 @@ -58,6 +58,12 @@ #include #include #include +#if defined(CONFIG_INET_AH) || defined(CONFIG_INET_AH_MODULE) || defined(CONFIG_INET6_AH) || defined(CONFIG_INET6_AH_MODULE) +#include +#endif +#if defined(CONFIG_INET_ESP) || defined(CONFIG_INET_ESP_MODULE) || defined(CONFIG_INET6_ESP) || defined(CONFIG_INET6_ESP_MODULE) +#include +#endif extern struct net_proto_family inet_family_ops; @@ -188,6 +194,7 @@ #endif #ifdef CONFIG_SYSCTL 
EXPORT_SYMBOL(neigh_sysctl_register); +EXPORT_SYMBOL(neigh_sysctl_unregister); #endif EXPORT_SYMBOL(pneigh_lookup); EXPORT_SYMBOL(pneigh_enqueue); @@ -284,6 +291,7 @@ EXPORT_SYMBOL(inetdev_by_index); EXPORT_SYMBOL(in_dev_finish_destroy); EXPORT_SYMBOL(ip_defrag); +EXPORT_SYMBOL(inet_peer_idlock); /* Route manipulation */ EXPORT_SYMBOL(ip_rt_ioctl); @@ -299,6 +307,14 @@ EXPORT_SYMBOL(dlci_ioctl_hook); #endif +#if defined(CONFIG_INET_ESP) || defined(CONFIG_INET_ESP_MODULE) || defined(CONFIG_INET6_ESP) || defined(CONFIG_INET6_ESP_MODULE) +EXPORT_SYMBOL_GPL(skb_cow_data); +EXPORT_SYMBOL_GPL(pskb_put); +EXPORT_SYMBOL_GPL(skb_to_sgvec); +#endif + +EXPORT_SYMBOL(flow_cache_lookup); +EXPORT_SYMBOL(flow_cache_genid); #if defined (CONFIG_IPV6_MODULE) || defined (CONFIG_KHTTPD) || defined (CONFIG_KHTTPD_MODULE) || defined (CONFIG_IP_SCTP_MODULE) /* inet functions common to v4 and v6 */ @@ -412,8 +428,9 @@ EXPORT_SYMBOL(secure_ipv6_id); #endif -#endif +EXPORT_SYMBOL(ip_generic_getfrag); +#endif EXPORT_SYMBOL(tcp_read_sock); #ifdef CONFIG_IP_SCTP_MODULE @@ -490,6 +507,7 @@ EXPORT_SYMBOL(loopback_dev); EXPORT_SYMBOL(register_netdevice); EXPORT_SYMBOL(unregister_netdevice); +EXPORT_SYMBOL(synchronize_net); EXPORT_SYMBOL(netdev_state_change); EXPORT_SYMBOL(dev_new_index); EXPORT_SYMBOL(dev_get_by_flags); Index: net/atm/clip.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/atm/clip.c,v retrieving revision 1.1.1.17 retrieving revision 1.1.1.17.2.1 diff -u -r1.1.1.17 -r1.1.1.17.2.1 --- a/net/atm/clip.c 18 Feb 2004 13:36:32 -0000 1.1.1.17 +++ b/net/atm/clip.c 16 Apr 2004 13:16:19 -0000 1.1.1.17.2.1 @@ -521,6 +521,7 @@ struct atmarp_entry *entry; int error; struct clip_vcc *clip_vcc; + struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip, .tos = 1 } } }; struct rtable *rt; if (vcc->push != clip_push) { @@ -537,7 +538,7 @@ unlink_clip_vcc(clip_vcc); return 0; } - error = 
ip_route_output(&rt,ip,0,1,0); + error = ip_route_output_key(&rt,&fl); if (error) return error; neigh = __neigh_lookup(&clip_tbl,&ip,rt->u.dst.dev,1); ip_rt_put(rt); Index: net/core/Makefile =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/core/Makefile,v retrieving revision 1.1.1.18 retrieving revision 1.1.1.18.2.1 diff -u -r1.1.1.18 -r1.1.1.18.2.1 --- a/net/core/Makefile 28 Nov 2003 18:26:21 -0000 1.1.1.18 +++ b/net/core/Makefile 16 Apr 2004 13:16:19 -0000 1.1.1.18.2.1 @@ -21,8 +21,8 @@ obj-$(CONFIG_FILTER) += filter.o -obj-$(CONFIG_NET) += dev.o ethtool.o dev_mcast.o dst.o neighbour.o \ - rtnetlink.o utils.o +obj-$(CONFIG_NET) += flow.o dev.o ethtool.o dev_mcast.o dst.o \ + neighbour.o rtnetlink.o utils.o obj-$(CONFIG_NETFILTER) += netfilter.o obj-$(CONFIG_NET_DIVERT) += dv.o Index: net/core/dev.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/core/dev.c,v retrieving revision 1.1.1.30 retrieving revision 1.1.1.30.2.1 diff -u -r1.1.1.30 -r1.1.1.30.2.1 --- a/net/core/dev.c 14 Apr 2004 13:05:41 -0000 1.1.1.30 +++ b/net/core/dev.c 16 Apr 2004 13:16:19 -0000 1.1.1.30.2.1 @@ -912,6 +912,13 @@ return notifier_chain_register(&netdev_chain, nb); } +/* Synchronize with packet receive processing. 
*/ +void synchronize_net(void) +{ + br_write_lock_bh(BR_NETPROTO_LOCK); + br_write_unlock_bh(BR_NETPROTO_LOCK); +} + /** * unregister_netdevice_notifier - unregister a network notifier block * @nb: notifier Index: net/core/dst.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/core/dst.c,v retrieving revision 1.1.1.15 retrieving revision 1.1.1.15.2.1 diff -u -r1.1.1.15 -r1.1.1.15.2.1 --- a/net/core/dst.c 25 Aug 2003 11:44:44 -0000 1.1.1.15 +++ b/net/core/dst.c 16 Apr 2004 13:16:20 -0000 1.1.1.15.2.1 @@ -36,11 +36,11 @@ static unsigned long dst_gc_timer_expires; static unsigned long dst_gc_timer_inc = DST_GC_MAX; static void dst_run_gc(unsigned long); +static void ___dst_free(struct dst_entry * dst); static struct timer_list dst_gc_timer = { data: DST_GC_MIN, function: dst_run_gc }; - static void dst_run_gc(unsigned long dummy) { int delayed = 0; @@ -61,7 +61,25 @@ continue; } *dstp = dst->next; - dst_destroy(dst); + + dst = dst_destroy(dst); + if (dst) { + /* NOHASH and still referenced. Unless it is already + * on gc list, invalidate it and add to gc list. + * + * Note: this is temporary. Actually, NOHASH dst's + * must be obsoleted when parent is obsoleted. + * But we do not have state "obsoleted, but + * referenced by parent", so it is right. + */ + if (dst->obsolete > 1) + continue; + + ___dst_free(dst); + dst->next = *dstp; + *dstp = dst; + dstp = &dst->next; + } } if (!dst_garbage_list) { dst_gc_timer_inc = DST_GC_MAX; @@ -108,6 +126,7 @@ atomic_set(&dst->__refcnt, 0); dst->ops = ops; dst->lastuse = jiffies; + dst->path = dst; dst->input = dst_discard; dst->output = dst_blackhole; #if RT_CACHE_DEBUG >= 2 @@ -117,10 +136,8 @@ return dst; } -void __dst_free(struct dst_entry * dst) +static void ___dst_free(struct dst_entry * dst) { - spin_lock_bh(&dst_lock); - /* The first case (dev==NULL) is required, when protocol module is unloaded. 
*/ @@ -129,6 +146,12 @@ dst->output = dst_blackhole; } dst->obsolete = 2; +} + +void __dst_free(struct dst_entry * dst) +{ + spin_lock_bh(&dst_lock); + ___dst_free(dst); dst->next = dst_garbage_list; dst_garbage_list = dst; if (dst_gc_timer_inc > DST_GC_INC) { @@ -136,14 +159,19 @@ dst_gc_timer_expires = DST_GC_MIN; mod_timer(&dst_gc_timer, jiffies + dst_gc_timer_expires); } - spin_unlock_bh(&dst_lock); } -void dst_destroy(struct dst_entry * dst) +struct dst_entry *dst_destroy(struct dst_entry * dst) { - struct neighbour *neigh = dst->neighbour; - struct hh_cache *hh = dst->hh; + struct dst_entry *child; + struct neighbour *neigh; + struct hh_cache *hh; + +again: + neigh = dst->neighbour; + hh = dst->hh; + child = dst->child; dst->hh = NULL; if (hh && atomic_dec_and_test(&hh->hh_refcnt)) @@ -164,6 +192,21 @@ atomic_dec(&dst_total); #endif kmem_cache_free(dst->ops->kmem_cachep, dst); + + dst = child; + if (dst) { + if (atomic_dec_and_test(&dst->__refcnt)) { + /* We were real parent of this dst, so kill child. */ + if (dst->flags&DST_NOHASH) + goto again; + } else { + /* Child is still referenced, return it for freeing. */ + if (dst->flags&DST_NOHASH) + return dst; + /* Child is still in his hash table */ + } + } + return NULL; } static int dst_dev_event(struct notifier_block *this, unsigned long event, void *ptr) Index: net/core/flow.c =================================================================== RCS file: net/core/flow.c diff -N net/core/flow.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ b/net/core/flow.c 16 Apr 2004 13:16:20 -0000 1.5.18.1 @@ -0,0 +1,322 @@ +/* flow.c: Generic flow cache. + * + * Copyright (C) 2003 Alexey N. Kuznetsov (kuznet@ms2.inr.ac.ru) + * Copyright (C) 2003 David S. 
Miller (davem@redhat.com) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct flow_cache_entry { + struct flow_cache_entry *next; + u16 family; + u8 dir; + struct flowi key; + u32 genid; + void *object; + atomic_t *object_ref; +}; + +atomic_t flow_cache_genid = ATOMIC_INIT(0); + +static u32 flow_hash_shift; +#define flow_hash_size (1 << flow_hash_shift) +static struct flow_cache_entry **flow_table; +static kmem_cache_t *flow_cachep; + +static int flow_lwm, flow_hwm; + +struct flow_percpu_info { + int hash_rnd_recalc; + u32 hash_rnd; + int count; +} ____cacheline_aligned; +static struct flow_percpu_info flow_hash_info[NR_CPUS]; + +#define flow_hash_rnd_recalc(cpu) (flow_hash_info[cpu].hash_rnd_recalc) +#define flow_hash_rnd(cpu) (flow_hash_info[cpu].hash_rnd) +#define flow_count(cpu) (flow_hash_info[cpu].count) + +static struct timer_list flow_hash_rnd_timer; + +#define FLOW_HASH_RND_PERIOD (10 * 60 * HZ) + +struct flow_flush_info { + atomic_t cpuleft; + struct completion completion; +}; +static struct tasklet_struct flow_flush_tasklets[NR_CPUS]; +static DECLARE_MUTEX(flow_flush_sem); + +static void flow_cache_new_hashrnd(unsigned long arg) +{ + int i; + + for (i = 0; i < NR_CPUS; i++) + flow_hash_rnd_recalc(i) = 1; + + flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; + add_timer(&flow_hash_rnd_timer); +} + +static void __flow_cache_shrink(int cpu, int shrink_to) +{ + struct flow_cache_entry *fle, **flp; + int i; + + for (i = 0; i < flow_hash_size; i++) { + int k = 0; + + flp = &flow_table[cpu*flow_hash_size+i]; + while ((fle = *flp) != NULL && k < shrink_to) { + k++; + flp = &fle->next; + } + while ((fle = *flp) != NULL) { + *flp = fle->next; + if (fle->object) + atomic_dec(fle->object_ref); + kmem_cache_free(flow_cachep, fle); + flow_count(cpu)--; + } + } +} + +static void flow_cache_shrink(int cpu) +{ + int shrink_to = flow_lwm / flow_hash_size; + 
+ __flow_cache_shrink(cpu, shrink_to); +} + +static void flow_new_hash_rnd(int cpu) +{ + get_random_bytes(&flow_hash_rnd(cpu), sizeof(u32)); + flow_hash_rnd_recalc(cpu) = 0; + + __flow_cache_shrink(cpu, 0); +} + +static u32 flow_hash_code(struct flowi *key, int cpu) +{ + u32 *k = (u32 *) key; + + return (jhash2(k, (sizeof(*key) / sizeof(u32)), flow_hash_rnd(cpu)) & + (flow_hash_size - 1)); +} + +#if (BITS_PER_LONG == 64) +typedef u64 flow_compare_t; +#else +typedef u32 flow_compare_t; +#endif + +extern void flowi_is_missized(void); + +/* I hear what you're saying, use memcmp. But memcmp cannot make + * important assumptions that we can here, such as alignment and + * constant size. + */ +static int flow_key_compare(struct flowi *key1, struct flowi *key2) +{ + flow_compare_t *k1, *k1_lim, *k2; + const int n_elem = sizeof(struct flowi) / sizeof(flow_compare_t); + + if (sizeof(struct flowi) % sizeof(flow_compare_t)) + flowi_is_missized(); + + k1 = (flow_compare_t *) key1; + k1_lim = k1 + n_elem; + + k2 = (flow_compare_t *) key2; + + do { + if (*k1++ != *k2++) + return 1; + } while (k1 < k1_lim); + + return 0; +} + +void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir, + flow_resolve_t resolver) +{ + struct flow_cache_entry *fle, **head; + unsigned int hash; + int cpu; + + local_bh_disable(); + cpu = smp_processor_id(); + if (flow_hash_rnd_recalc(cpu)) + flow_new_hash_rnd(cpu); + hash = flow_hash_code(key, cpu); + + head = &flow_table[(cpu << flow_hash_shift) + hash]; + for (fle = *head; fle; fle = fle->next) { + if (fle->family == family && + fle->dir == dir && + flow_key_compare(key, &fle->key) == 0) { + if (fle->genid == atomic_read(&flow_cache_genid)) { + void *ret = fle->object; + + if (ret) + atomic_inc(fle->object_ref); + local_bh_enable(); + + return ret; + } + break; + } + } + + if (!fle) { + if (flow_count(cpu) > flow_hwm) + flow_cache_shrink(cpu); + + fle = kmem_cache_alloc(flow_cachep, SLAB_ATOMIC); + if (fle) { + fle->next = *head; + *head = fle; 
+ fle->family = family; + fle->dir = dir; + memcpy(&fle->key, key, sizeof(*key)); + fle->object = NULL; + flow_count(cpu)++; + } + } + + { + void *obj; + atomic_t *obj_ref; + + resolver(key, family, dir, &obj, &obj_ref); + + if (fle) { + fle->genid = atomic_read(&flow_cache_genid); + + if (fle->object) + atomic_dec(fle->object_ref); + + fle->object = obj; + fle->object_ref = obj_ref; + if (obj) + atomic_inc(fle->object_ref); + } + local_bh_enable(); + + return obj; + } +} + +static void flow_cache_flush_tasklet(unsigned long data) +{ + struct flow_flush_info *info = (void *)data; + int i; + int cpu; + + cpu = smp_processor_id(); + for (i = 0; i < flow_hash_size; i++) { + struct flow_cache_entry *fle; + + fle = flow_table[(cpu << flow_hash_shift) + i]; + for (; fle; fle = fle->next) { + unsigned genid = atomic_read(&flow_cache_genid); + + if (!fle->object || fle->genid == genid) + continue; + + fle->object = NULL; + atomic_dec(fle->object_ref); + } + } + + if (atomic_dec_and_test(&info->cpuleft)) + complete(&info->completion); +} + +static void flow_cache_flush_per_cpu(void *data) +{ + struct flow_flush_info *info = data; + int cpu; + struct tasklet_struct *tasklet; + + cpu = smp_processor_id(); + tasklet = &flow_flush_tasklets[cpu]; + tasklet_init(tasklet, flow_cache_flush_tasklet, (unsigned long)info); + tasklet_schedule(tasklet); +} + +void flow_cache_flush(void) +{ + struct flow_flush_info info; + + atomic_set(&info.cpuleft, smp_num_cpus); + init_completion(&info.completion); + + down(&flow_flush_sem); + + local_bh_disable(); + smp_call_function(flow_cache_flush_per_cpu, &info, 1, 0); + flow_cache_flush_per_cpu(&info); + local_bh_enable(); + + wait_for_completion(&info.completion); + + up(&flow_flush_sem); +} + +static int __init flow_cache_init(void) +{ + unsigned long order; + int i; + + flow_cachep = kmem_cache_create("flow_cache", + sizeof(struct flow_cache_entry), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + + if (!flow_cachep) + panic("NET: failed to allocate 
flow cache slab\n"); + + flow_hash_shift = 10; + flow_lwm = 2 * flow_hash_size; + flow_hwm = 4 * flow_hash_size; + + for (i = 0; i < NR_CPUS; i++) + flow_hash_rnd_recalc(i) = 1; + + init_timer(&flow_hash_rnd_timer); + flow_hash_rnd_timer.function = flow_cache_new_hashrnd; + flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; + add_timer(&flow_hash_rnd_timer); + + for (order = 0; + (PAGE_SIZE << order) < + (NR_CPUS*sizeof(struct flow_cache_entry *)*flow_hash_size); + order++) + /* NOTHING: loop only finds the smallest page order that holds the per-cpu hash table */; + + flow_table = (struct flow_cache_entry **) + __get_free_pages(GFP_ATOMIC, order); + + if (!flow_table) + panic("Failed to allocate flow cache hash table\n"); + + memset(flow_table, 0, PAGE_SIZE << order); + + return 0; +} + +module_init(flow_cache_init); Index: net/core/neighbour.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/core/neighbour.c,v retrieving revision 1.1.1.19 retrieving revision 1.1.1.19.2.1 diff -u -r1.1.1.19 -r1.1.1.19.2.1 --- a/net/core/neighbour.c 18 Feb 2004 13:36:32 -0000 1.1.1.19 +++ b/net/core/neighbour.c 16 Apr 2004 13:16:20 -0000 1.1.1.19.2.1 @@ -638,7 +638,9 @@ static __inline__ int neigh_max_probes(struct neighbour *n) { struct neigh_parms *p = n->parms; - return p->ucast_probes + p->app_probes + p->mcast_probes; + return (n->nud_state & NUD_PROBE ? 
+ p->ucast_probes : + p->ucast_probes + p->app_probes + p->mcast_probes); } @@ -1117,9 +1119,6 @@ if (*p == parms) { *p = parms->next; write_unlock_bh(&tbl->lock); -#ifdef CONFIG_SYSCTL - neigh_sysctl_unregister(parms); -#endif kfree(parms); return; } @@ -1184,9 +1183,6 @@ } } write_unlock(&neigh_tbl_lock); -#ifdef CONFIG_SYSCTL - neigh_sysctl_unregister(&tbl->parms); -#endif return 0; } Index: net/core/netfilter.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/core/netfilter.c,v retrieving revision 1.1.1.20 retrieving revision 1.1.1.20.2.1 diff -u -r1.1.1.20 -r1.1.1.20.2.1 --- a/net/core/netfilter.c 25 Aug 2003 11:44:44 -0000 1.1.1.20 +++ b/net/core/netfilter.c 16 Apr 2004 13:16:20 -0000 1.1.1.20.2.1 @@ -563,7 +563,7 @@ { struct iphdr *iph = (*pskb)->nh.iph; struct rtable *rt; - struct rt_key key = {}; + struct flowi fl = {}; struct dst_entry *odst; unsigned int hh_len; @@ -571,14 +571,15 @@ * packets with foreign saddr to be appear on the NF_IP_LOCAL_OUT hook. */ if (inet_addr_type(iph->saddr) == RTN_LOCAL) { - key.dst = iph->daddr; - key.src = iph->saddr; - key.oif = (*pskb)->sk ? (*pskb)->sk->bound_dev_if : 0; - key.tos = RT_TOS(iph->tos); + fl.nl_u.ip4_u.daddr = iph->daddr; + fl.nl_u.ip4_u.saddr = iph->saddr; + fl.oif = (*pskb)->sk ? (*pskb)->sk->bound_dev_if : 0; + fl.nl_u.ip4_u.tos = RT_TOS(iph->tos); #ifdef CONFIG_IP_ROUTE_FWMARK - key.fwmark = (*pskb)->nfmark; + fl.nl_u.ip4_u.fwmark = (*pskb)->nfmark; #endif - if (ip_route_output_key(&rt, &key) != 0) + fl.proto = iph->protocol; + if (ip_route_output_key(&rt, &fl) != 0) return -1; /* Drop old route. */ @@ -587,8 +588,8 @@ } else { /* non-local src, find valid iif to satisfy * rp-filter when calling ip_route_input. 
*/ - key.dst = iph->saddr; - if (ip_route_output_key(&rt, &key) != 0) + fl.nl_u.ip4_u.daddr = iph->saddr; + if (ip_route_output_key(&rt, &fl) != 0) return -1; odst = (*pskb)->dst; Index: net/core/rtnetlink.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/core/rtnetlink.c,v retrieving revision 1.1.1.18 retrieving revision 1.1.1.18.2.1 diff -u -r1.1.1.18 -r1.1.1.18.2.1 --- a/net/core/rtnetlink.c 25 Aug 2003 11:44:44 -0000 1.1.1.18 +++ b/net/core/rtnetlink.c 16 Apr 2004 13:16:20 -0000 1.1.1.18.2.1 @@ -128,7 +128,7 @@ return err; } -int rtnetlink_put_metrics(struct sk_buff *skb, unsigned *metrics) +int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics) { struct rtattr *mx = (struct rtattr*)skb->tail; int i; @@ -136,7 +136,7 @@ RTA_PUT(skb, RTA_METRICS, 0, NULL); for (i=0; i<RTAX_MAX; i++) { if (metrics[i]) - RTA_PUT(skb, i+1, sizeof(unsigned), metrics+i); + RTA_PUT(skb, i+1, sizeof(u32), metrics+i); } mx->rta_len = skb->tail - (u8*)mx; if (mx->rta_len == RTA_LENGTH(0)) Index: net/core/skbuff.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/core/skbuff.c,v retrieving revision 1.1.1.22 retrieving revision 1.1.1.22.2.1 diff -u -r1.1.1.22 -r1.1.1.22.2.1 --- a/net/core/skbuff.c 25 Aug 2003 11:44:44 -0000 1.1.1.22 +++ b/net/core/skbuff.c 16 Apr 2004 13:16:20 -0000 1.1.1.22.2.1 @@ -57,6 +57,7 @@ #include #include #include +#include #include #include @@ -201,6 +202,7 @@ /* Set up other state */ skb->len = 0; + skb->local_df = 0; skb->cloned = 0; skb->data_len = 0; @@ -233,6 +235,7 @@ skb->dev = NULL; skb->real_dev = NULL; skb->dst = NULL; + skb->sp = NULL; memset(skb->cb, 0, sizeof(skb->cb)); skb->pkt_type = PACKET_HOST; /* Default type */ skb->ip_summed = 0; @@ -317,6 +320,9 @@ } dst_release(skb->dst); +#ifdef CONFIG_XFRM + secpath_put(skb->sp); +#endif if(skb->destructor) { if (in_irq()) { printk(KERN_WARNING "Warning: kfree_skb on hard IRQ %p\n", @@ -369,10 +375,15 @@ C(mac); C(dst); dst_clone(n->dst); + C(sp); +#ifdef 
CONFIG_INET + secpath_get(n->sp); +#endif memcpy(n->cb, skb->cb, sizeof(skb->cb)); C(len); C(data_len); C(csum); + C(local_df); n->cloned = 1; C(pkt_type); C(ip_summed); @@ -423,11 +434,15 @@ new->priority=old->priority; new->protocol=old->protocol; new->dst=dst_clone(old->dst); +#ifdef CONFIG_INET + new->sp=secpath_get(old->sp); +#endif new->h.raw=old->h.raw+offset; new->nh.raw=old->nh.raw+offset; new->mac.raw=old->mac.raw+offset; memcpy(new->cb, old->cb, sizeof(old->cb)); atomic_set(&new->users, 1); + new->local_df=old->local_df; new->pkt_type=old->pkt_type; new->stamp=old->stamp; new->destructor = NULL; Index: net/decnet/dn_nsp_out.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/decnet/dn_nsp_out.c,v retrieving revision 1.1.1.15 retrieving revision 1.1.1.15.2.1 diff -u -r1.1.1.15 -r1.1.1.15.2.1 --- a/net/decnet/dn_nsp_out.c 22 Jan 2001 21:32:10 -0000 1.1.1.15 +++ b/net/decnet/dn_nsp_out.c 16 Apr 2004 13:16:20 -0000 1.1.1.15.2.1 @@ -593,7 +593,7 @@ * associations. 
*/ skb->dst = dst_clone(dst); - skb->dst->output(skb); + dst_output(skb); } Index: net/decnet/dn_route.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/decnet/dn_route.c,v retrieving revision 1.1.1.21 retrieving revision 1.1.1.21.2.1 diff -u -r1.1.1.21 -r1.1.1.21.2.1 --- a/net/decnet/dn_route.c 28 Nov 2002 23:53:15 -0000 1.1.1.21 +++ b/net/decnet/dn_route.c 16 Apr 2004 13:16:20 -0000 1.1.1.21.2.1 @@ -100,7 +100,6 @@ static int dn_dst_gc(void); static struct dst_entry *dn_dst_check(struct dst_entry *, __u32); -static struct dst_entry *dn_dst_reroute(struct dst_entry *, struct sk_buff *skb); static struct dst_entry *dn_dst_negative_advice(struct dst_entry *); static void dn_dst_link_failure(struct sk_buff *); static int dn_route_input(struct sk_buff *); @@ -119,7 +118,6 @@ gc_thresh: 128, gc: dn_dst_gc, check: dn_dst_check, - reroute: dn_dst_reroute, negative_advice: dn_dst_negative_advice, link_failure: dn_dst_link_failure, entry_size: sizeof(struct dn_route), @@ -202,12 +200,6 @@ return NULL; } -static struct dst_entry *dn_dst_reroute(struct dst_entry *dst, - struct sk_buff *skb) -{ - return NULL; -} - /* * This is called through sendmsg() when you specify MSG_TRYHARD * and there is already a route in cache. @@ -396,7 +388,7 @@ int err; if ((err = dn_route_input(skb)) == 0) - return skb->dst->input(skb); + return dst_input(skb); if (decnet_debug_level & 4) { char *devname = skb->dev ? 
skb->dev->name : "???"; @@ -1049,10 +1041,12 @@ RTA_PUT(skb, RTA_SRC, 2, &rt->rt_saddr); if (rt->u.dst.dev) RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex); - if (rt->u.dst.window) - RTA_PUT(skb, RTAX_WINDOW, sizeof(unsigned), &rt->u.dst.window); - if (rt->u.dst.rtt) - RTA_PUT(skb, RTAX_RTT, sizeof(unsigned), &rt->u.dst.rtt); + if (dst_metric(&rt->u.dst, RTAX_WINDOW)) + RTA_PUT(skb, RTAX_WINDOW, sizeof(unsigned), + &rt->u.dst.metrics[RTAX_WINDOW - 1]); + if (dst_metric(&rt->u.dst, RTAX_RTT)) + RTA_PUT(skb, RTAX_RTT, sizeof(unsigned), + &rt->u.dst.metrics[RTAX_RTT - 1]); /* same RTAX_x - 1 slot convention as RTAX_WINDOW */ nlh->nlmsg_len = skb->tail - b; return skb->len; @@ -1208,7 +1202,7 @@ dn_addr2asc(dn_ntohs(rt->rt_saddr), buf2), atomic_read(&rt->u.dst.__refcnt), rt->u.dst.__use, - (int)rt->u.dst.rtt + (int) dst_metric(&rt->u.dst, RTAX_RTT) ); Index: net/ipv4/Config.in =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv4/Config.in,v retrieving revision 1.1.1.17 retrieving revision 1.1.1.17.2.1 diff -u -r1.1.1.17 -r1.1.1.17.2.1 --- a/net/ipv4/Config.in 28 Nov 2003 18:26:21 -0000 1.1.1.17 +++ b/net/ipv4/Config.in 16 Apr 2004 13:16:20 -0000 1.1.1.17.2.1 @@ -40,6 +40,9 @@ fi bool ' IP: TCP Explicit Congestion Notification support' CONFIG_INET_ECN bool ' IP: TCP syncookie support (disabled per default)' CONFIG_SYN_COOKIES +tristate ' IP: AH transformation' CONFIG_INET_AH +tristate ' IP: ESP transformation' CONFIG_INET_ESP +tristate ' IP: IPComp transformation' CONFIG_INET_IPCOMP if [ "$CONFIG_NETFILTER" != "n" ]; then source net/ipv4/netfilter/Config.in fi Index: net/ipv4/Makefile =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv4/Makefile,v retrieving revision 1.1.1.18 retrieving revision 1.1.1.18.2.1 diff -u -r1.1.1.18 -r1.1.1.18.2.1 --- a/net/ipv4/Makefile 21 Dec 2001 17:42:05 -0000 1.1.1.18 +++ b/net/ipv4/Makefile 16 Apr 2004 
13:16:21 -0000 1.1.1.18.2.1 @@ -24,6 +24,11 @@ obj-$(CONFIG_NET_IPIP) += ipip.o obj-$(CONFIG_NET_IPGRE) += ip_gre.o obj-$(CONFIG_SYN_COOKIES) += syncookies.o +obj-$(CONFIG_INET_AH) += ah4.o +obj-$(CONFIG_INET_ESP) += esp4.o +obj-$(CONFIG_INET_IPCOMP) += ipcomp.o obj-$(CONFIG_IP_PNP) += ipconfig.o +obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o xfrm4_tunnel.o + include $(TOPDIR)/Rules.make Index: net/ipv4/af_inet.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv4/af_inet.c,v retrieving revision 1.1.1.24 retrieving revision 1.1.1.24.2.1 diff -u -r1.1.1.24 -r1.1.1.24.2.1 --- a/net/ipv4/af_inet.c 28 Nov 2003 18:26:21 -0000 1.1.1.24 +++ b/net/ipv4/af_inet.c 16 Apr 2004 13:16:21 -0000 1.1.1.24.2.1 @@ -89,6 +89,7 @@ #include #include +#include #include #include #include @@ -103,6 +104,7 @@ #include #include #include +#include #ifdef CONFIG_IP_MROUTE #include #endif @@ -213,6 +215,8 @@ sock_orphan(sk); + xfrm_sk_free_policy(sk); + #ifdef INET_REFCNT_DEBUG if (atomic_read(&sk->refcnt) != 1) { printk(KERN_DEBUG "Destruction inet %p delayed, c=%d\n", sk, atomic_read(&sk->refcnt)); @@ -386,7 +390,7 @@ sk->backlog_rcv = sk->prot->backlog_rcv; - sk->protinfo.af_inet.ttl = sysctl_ip_default_ttl; + sk->protinfo.af_inet.uc_ttl = -1; sk->protinfo.af_inet.mc_loop = 1; sk->protinfo.af_inet.mc_ttl = 1; @@ -698,6 +702,27 @@ return err; } +#ifdef CONFIG_IP_MULTICAST +static struct inet_protocol igmp_protocol = { + .handler = igmp_rcv, +}; +#endif + +static struct inet_protocol tcp_protocol = { + .handler = tcp_v4_rcv, + .err_handler = tcp_v4_err, + .no_policy = 1, +}; + +static struct inet_protocol udp_protocol = { + .handler = udp_rcv, + .err_handler = udp_err, + .no_policy = 1, +}; + +static struct inet_protocol icmp_protocol = { + .handler = icmp_rcv, +}; /* * This does both peername and sockname. 
@@ -724,6 +749,7 @@ sin->sin_port = sk->sport; sin->sin_addr.s_addr = addr; } + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); *uaddr_len = sizeof(*sin); return(0); } @@ -757,6 +783,21 @@ return sk->prot->sendmsg(sk, msg, size); } + +ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) +{ + struct sock *sk = sock->sk; + + /* We may need to bind the socket. */ + if (!sk->num && inet_autobind(sk)) + return -EAGAIN; + + if (sk->prot->sendpage) + return sk->prot->sendpage(sk, page, offset, size, flags); + return sock_no_sendpage(sock, page, offset, size, flags); +} + + int inet_shutdown(struct socket *sock, int how) { struct sock *sk = sock->sk; @@ -981,7 +1022,7 @@ sendmsg: inet_sendmsg, recvmsg: inet_recvmsg, mmap: sock_no_mmap, - sendpage: sock_no_sendpage, + sendpage: inet_sendpage, }; struct net_proto_family inet_family_ops = { @@ -1109,7 +1150,6 @@ static int __init inet_init(void) { struct sk_buff *dummy_skb; - struct inet_protocol *p; struct inet_protosw *q; struct list_head *r; @@ -1127,16 +1167,19 @@ (void) sock_register(&inet_family_ops); /* - * Add all the protocols. + * Add all the base protocols. */ - printk(KERN_INFO "IP Protocols: "); - for (p = inet_protocol_base; p != NULL;) { - struct inet_protocol *tmp = (struct inet_protocol *) p->next; - inet_add_protocol(p); - printk("%s%s",p->name,tmp?", ":"\n"); - p = tmp; - } + if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0) + printk(KERN_CRIT "inet_init: Cannot add ICMP protocol\n"); + if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0) + printk(KERN_CRIT "inet_init: Cannot add UDP protocol\n"); + if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0) + printk(KERN_CRIT "inet_init: Cannot add TCP protocol\n"); +#ifdef CONFIG_IP_MULTICAST + if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0) + printk(KERN_CRIT "inet_init: Cannot add IGMP protocol\n"); +#endif /* Register the socket-side information for inet_create. 
*/ for(r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r) Index: net/ipv4/ah4.c =================================================================== RCS file: net/ipv4/ah4.c diff -N net/ipv4/ah4.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ b/net/ipv4/ah4.c 16 Apr 2004 13:16:21 -0000 1.6.2.1 @@ -0,0 +1,377 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* Clear mutable options and find final destination to substitute + * into IP header for icv calculation. Options are already checked + * for validity, so paranoia is not required. */ + +static int ip_clear_mutable_options(struct iphdr *iph, u32 *daddr) +{ + unsigned char * optptr = (unsigned char*)(iph+1); + int l = iph->ihl*4 - sizeof(struct iphdr); + int optlen; + + while (l > 0) { + switch (*optptr) { + case IPOPT_END: + return 0; + case IPOPT_NOOP: + l--; + optptr++; + continue; + } + optlen = optptr[1]; + if (optlen<2 || optlen>l) + return -EINVAL; + switch (*optptr) { + case IPOPT_SEC: + case 0x85: /* Some "Extended Security" crap. */ + case 0x86: /* Another "Commercial Security" crap. 
*/ + case IPOPT_RA: + case 0x80|21: /* RFC1770 */ + break; + case IPOPT_LSRR: + case IPOPT_SSRR: + if (optlen < 6) + return -EINVAL; + memcpy(daddr, optptr+optlen-4, 4); + /* Fall through */ + default: + memset(optptr+2, 0, optlen-2); + } + l -= optlen; + optptr += optlen; + } + return 0; +} + +static int ah_output(struct sk_buff *skb) +{ + int err; + struct dst_entry *dst = skb->dst; + struct xfrm_state *x = dst->xfrm; + struct iphdr *iph, *top_iph; + struct ip_auth_hdr *ah; + struct ah_data *ahp; + union { + struct iphdr iph; + char buf[60]; + } tmp_iph; + + if (skb->ip_summed == CHECKSUM_HW && skb_checksum_help(skb) == NULL) { + err = -EINVAL; + goto error_nolock; + } + + spin_lock_bh(&x->lock); + err = xfrm_check_output(x, skb, AF_INET); + if (err) + goto error; + + iph = skb->nh.iph; + if (x->props.mode) { + top_iph = (struct iphdr*)skb_push(skb, x->props.header_len); + top_iph->ihl = 5; + top_iph->version = 4; + top_iph->tos = 0; + top_iph->tot_len = htons(skb->len); + if (!(iph->frag_off&htons(IP_DF))) { +#ifdef NETIF_F_TSO + __ip_select_ident(top_iph, dst, 0); +#else + __ip_select_ident(top_iph, dst); +#endif + } + top_iph->frag_off = 0; + top_iph->ttl = 0; + top_iph->protocol = IPPROTO_AH; + top_iph->check = 0; + top_iph->saddr = x->props.saddr.a4; + top_iph->daddr = x->id.daddr.a4; + ah = (struct ip_auth_hdr*)(top_iph+1); + ah->nexthdr = IPPROTO_IPIP; + } else { + memcpy(&tmp_iph, skb->data, iph->ihl*4); + top_iph = (struct iphdr*)skb_push(skb, x->props.header_len); + memcpy(top_iph, &tmp_iph, iph->ihl*4); + iph = &tmp_iph.iph; + top_iph->tos = 0; + top_iph->tot_len = htons(skb->len); + top_iph->frag_off = 0; + top_iph->ttl = 0; + top_iph->protocol = IPPROTO_AH; + top_iph->check = 0; + if (top_iph->ihl != 5) { + err = ip_clear_mutable_options(top_iph, &top_iph->daddr); + if (err) + goto error; + } + ah = (struct ip_auth_hdr*)((char*)top_iph+iph->ihl*4); + ah->nexthdr = iph->protocol; + } + ahp = x->data; + ah->hdrlen = (XFRM_ALIGN8(sizeof(struct 
ip_auth_hdr) + + ahp->icv_trunc_len) >> 2) - 2; + + ah->reserved = 0; + ah->spi = x->id.spi; + ah->seq_no = htonl(++x->replay.oseq); + ahp->icv(ahp, skb, ah->auth_data); + top_iph->tos = iph->tos; + top_iph->ttl = iph->ttl; + if (x->props.mode) { + if (x->props.flags & XFRM_STATE_NOECN) + IP_ECN_clear(top_iph); + top_iph->frag_off = iph->frag_off&~htons(IP_MF|IP_OFFSET); + memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + } else { + top_iph->frag_off = iph->frag_off; + top_iph->daddr = iph->daddr; + if (iph->ihl != 5) + memcpy(top_iph+1, iph+1, iph->ihl*4 - sizeof(struct iphdr)); + } + ip_send_check(top_iph); + + skb->nh.raw = skb->data; + + x->curlft.bytes += skb->len; + x->curlft.packets++; + spin_unlock_bh(&x->lock); + if ((skb->dst = dst_pop(dst)) == NULL) { + err = -EHOSTUNREACH; + goto error_nolock; + } + return NET_XMIT_BYPASS; + +error: + spin_unlock_bh(&x->lock); +error_nolock: + kfree_skb(skb); + return err; +} + +int ah_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb) +{ + int ah_hlen; + struct iphdr *iph; + struct ip_auth_hdr *ah; + struct ah_data *ahp; + char work_buf[60]; + + if (!pskb_may_pull(skb, sizeof(struct ip_auth_hdr))) + goto out; + + ah = (struct ip_auth_hdr*)skb->data; + ahp = x->data; + ah_hlen = (ah->hdrlen + 2) << 2; + + if (ah_hlen != XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_full_len) && + ah_hlen != XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len)) + goto out; + + if (!pskb_may_pull(skb, ah_hlen)) + goto out; + + /* We are going to _remove_ AH header to keep sockets happy, + * so... Later this can change. 
*/ + if (skb_cloned(skb) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + goto out; + + skb->ip_summed = CHECKSUM_NONE; + + ah = (struct ip_auth_hdr*)skb->data; + iph = skb->nh.iph; + + memcpy(work_buf, iph, iph->ihl*4); + + iph->ttl = 0; + iph->tos = 0; + iph->frag_off = 0; + iph->check = 0; + if (iph->ihl != 5) { + u32 dummy; + if (ip_clear_mutable_options(iph, &dummy)) + goto out; + } + { + u8 auth_data[MAX_AH_AUTH_LEN]; + + memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); + skb_push(skb, skb->data - skb->nh.raw); + ahp->icv(ahp, skb, ah->auth_data); + if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) { + x->stats.integrity_failed++; + goto out; + } + } + ((struct iphdr*)work_buf)->protocol = ah->nexthdr; + skb->nh.raw = skb_pull(skb, ah_hlen); + memcpy(skb->nh.raw, work_buf, iph->ihl*4); + skb->nh.iph->tot_len = htons(skb->len); + skb_pull(skb, skb->nh.iph->ihl*4); + skb->h.raw = skb->data; + + return 0; + +out: + return -EINVAL; +} + +void ah4_err(struct sk_buff *skb, u32 info) +{ + struct iphdr *iph = (struct iphdr*)skb->data; + struct ip_auth_hdr *ah = (struct ip_auth_hdr*)(skb->data+(iph->ihl<<2)); + struct xfrm_state *x; + + if (skb->h.icmph->type != ICMP_DEST_UNREACH || + skb->h.icmph->code != ICMP_FRAG_NEEDED) + return; + + x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET); + if (!x) + return; + printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/%08x\n", + ntohl(ah->spi), ntohl(iph->daddr)); + xfrm_state_put(x); +} + +static int ah_init_state(struct xfrm_state *x, void *args) +{ + struct ah_data *ahp = NULL; + struct xfrm_algo_desc *aalg_desc; + + if (!x->aalg) + goto error; + + /* null auth can use a zero length key */ + if (x->aalg->alg_key_len > 512) + goto error; + + ahp = kmalloc(sizeof(*ahp), GFP_KERNEL); + if (ahp == NULL) + return -ENOMEM; + + memset(ahp, 0, sizeof(*ahp)); + + ahp->key = x->aalg->alg_key; + ahp->key_len = (x->aalg->alg_key_len+7)/8; + ahp->tfm = crypto_alloc_tfm(x->aalg->alg_name, 0); + 
if (!ahp->tfm) + goto error; + ahp->icv = ah_hmac_digest; + + /* + * Lookup the algorithm description maintained by xfrm_algo, + * verify crypto transform properties, and store information + * we need for AH processing. This lookup cannot fail here + * after a successful crypto_alloc_tfm(). + */ + aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name); + BUG_ON(!aalg_desc); + + if (aalg_desc->uinfo.auth.icv_fullbits/8 != + crypto_tfm_alg_digestsize(ahp->tfm)) { + printk(KERN_INFO "AH: %s digestsize %u != %hu\n", + x->aalg->alg_name, crypto_tfm_alg_digestsize(ahp->tfm), + aalg_desc->uinfo.auth.icv_fullbits/8); + goto error; + } + + ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8; + ahp->icv_trunc_len = aalg_desc->uinfo.auth.icv_truncbits/8; + + BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN); + + ahp->work_icv = kmalloc(ahp->icv_full_len, GFP_KERNEL); + if (!ahp->work_icv) + goto error; + + x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len); + if (x->props.mode) + x->props.header_len += sizeof(struct iphdr); + x->data = ahp; + + return 0; + +error: + if (ahp) { + if (ahp->work_icv) + kfree(ahp->work_icv); + if (ahp->tfm) + crypto_free_tfm(ahp->tfm); + kfree(ahp); + } + return -EINVAL; +} + +static void ah_destroy(struct xfrm_state *x) +{ + struct ah_data *ahp = x->data; + + if (!ahp) + return; + + if (ahp->work_icv) { + kfree(ahp->work_icv); + ahp->work_icv = NULL; + } + if (ahp->tfm) { + crypto_free_tfm(ahp->tfm); + ahp->tfm = NULL; + } + kfree(ahp); +} + + +static struct xfrm_type ah_type = +{ + .description = "AH4", + .owner = THIS_MODULE, + .proto = IPPROTO_AH, + .init_state = ah_init_state, + .destructor = ah_destroy, + .input = ah_input, + .output = ah_output +}; + +static struct inet_protocol ah4_protocol = { + .handler = xfrm4_rcv, + .err_handler = ah4_err, + .no_policy = 1, +}; + +static int __init ah4_init(void) +{ + if (xfrm_register_type(&ah_type, AF_INET) < 0) { + printk(KERN_INFO "ip ah init: can't add xfrm type\n"); 
+ return -EAGAIN; + } + if (inet_add_protocol(&ah4_protocol, IPPROTO_AH) < 0) { + printk(KERN_INFO "ip ah init: can't add protocol\n"); + xfrm_unregister_type(&ah_type, AF_INET); + return -EAGAIN; + } + return 0; +} + +static void __exit ah4_fini(void) +{ + if (inet_del_protocol(&ah4_protocol, IPPROTO_AH) < 0) + printk(KERN_INFO "ip ah close: can't remove protocol\n"); + if (xfrm_unregister_type(&ah_type, AF_INET) < 0) + printk(KERN_INFO "ip ah close: can't remove xfrm type\n"); +} + +module_init(ah4_init); +module_exit(ah4_fini); +MODULE_LICENSE("GPL"); Index: net/ipv4/arp.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv4/arp.c,v retrieving revision 1.1.1.22 retrieving revision 1.1.1.22.2.1 diff -u -r1.1.1.22 -r1.1.1.22.2.1 --- a/net/ipv4/arp.c 14 Apr 2004 13:05:41 -0000 1.1.1.22 +++ b/net/ipv4/arp.c 16 Apr 2004 13:16:21 -0000 1.1.1.22.2.1 @@ -413,11 +413,13 @@ static int arp_filter(__u32 sip, __u32 tip, struct net_device *dev) { + struct flowi fl = { .nl_u = { .ip4_u = { .daddr = sip, + .saddr = tip } } }; struct rtable *rt; int flag = 0; /*unsigned long now; */ - if (ip_route_output(&rt, sip, tip, 0, 0) < 0) + if (ip_route_output_key(&rt, &fl) < 0) return 1; if (rt->u.dst.dev != dev) { NET_INC_STATS_BH(ArpFilter); @@ -563,11 +565,11 @@ */ skb = alloc_skb(sizeof(struct arphdr)+ 2*(dev->addr_len+4) - + dev->hard_header_len + 15, GFP_ATOMIC); + + LL_RESERVED_SPACE(dev), GFP_ATOMIC); if (skb == NULL) return NULL; - skb_reserve(skb, (dev->hard_header_len+15)&~15); + skb_reserve(skb, LL_RESERVED_SPACE(dev)); skb->nh.raw = skb->data; arp = (struct arphdr *) skb_put(skb,sizeof(struct arphdr) + 2*(dev->addr_len+4)); skb->dev = dev; @@ -1016,8 +1018,10 @@ if (r->arp_flags & ATF_PERM) r->arp_flags |= ATF_COM; if (dev == NULL) { + struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip, + .tos = RTO_ONLINK } } }; struct rtable * rt; - if ((err = ip_route_output(&rt, ip, 0, 
RTO_ONLINK, 0)) != 0) + if ((err = ip_route_output_key(&rt, &fl)) != 0) return err; dev = rt->u.dst.dev; ip_rt_put(rt); @@ -1099,8 +1103,10 @@ } if (dev == NULL) { + struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip, + .tos = RTO_ONLINK } } }; struct rtable * rt; - if ((err = ip_route_output(&rt, ip, 0, RTO_ONLINK, 0)) != 0) + if ((err = ip_route_output_key(&rt, &fl)) != 0) return err; dev = rt->u.dst.dev; ip_rt_put(rt); Index: net/ipv4/devinet.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv4/devinet.c,v retrieving revision 1.1.1.24 retrieving revision 1.1.1.24.2.1 diff -u -r1.1.1.24 -r1.1.1.24.2.1 --- a/net/ipv4/devinet.c 14 Apr 2004 13:05:41 -0000 1.1.1.24 +++ b/net/ipv4/devinet.c 16 Apr 2004 13:16:21 -0000 1.1.1.24.2.1 @@ -180,7 +180,9 @@ /* in_dev_put following below will kill the in_device */ write_unlock_bh(&inetdev_lock); - +#ifdef CONFIG_SYSCTL + neigh_sysctl_unregister(in_dev->arp_parms); +#endif neigh_parms_release(&arp_tbl, in_dev->arp_parms); in_dev_put(in_dev); } @@ -926,6 +928,8 @@ memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); inet_insert_ifa(ifa); } + in_dev->cnf.no_xfrm = 1; + in_dev->cnf.no_policy = 1; } ip_mc_up(in_dev); break; @@ -1132,6 +1136,62 @@ return ret; } +int ipv4_doint_and_flush(ctl_table *ctl, int write, + struct file* filp, void *buffer, + size_t *lenp) +{ + int *valp = ctl->data; + int val = *valp; + int ret = proc_dointvec(ctl, write, filp, buffer, lenp); + + if (write && *valp != val) + rt_cache_flush(0); + + return ret; +} + +int ipv4_doint_and_flush_strategy(ctl_table *table, int *name, int nlen, + void *oldval, size_t *oldlenp, + void *newval, size_t newlen, + void **context) +{ + int *valp = table->data; + int new; + + if (!newval || !newlen) + return 0; + + if (newlen != sizeof(int)) + return -EINVAL; + + if (get_user(new, (int *)newval)) + return -EFAULT; + + if (new == *valp) + return 0; + + if (oldval && oldlenp) { + size_t 
len; + + if (get_user(len, oldlenp)) + return -EFAULT; + + if (len) { + if (len > table->maxlen) + len = table->maxlen; + if (copy_to_user(oldval, valp, len)) + return -EFAULT; + if (put_user(len, oldlenp)) + return -EFAULT; + } + } + + *valp = new; + rt_cache_flush(0); + return 1; +} + + static struct devinet_sysctl_table { struct ctl_table_header *sysctl_header; @@ -1190,6 +1250,12 @@ {NET_IPV4_CONF_ARP_IGNORE, "arp_ignore", &ipv4_devconf.arp_ignore, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_CONF_NOXFRM, "disable_xfrm", + &ipv4_devconf.no_xfrm, sizeof(int), 0644, NULL, + &ipv4_doint_and_flush, &ipv4_doint_and_flush_strategy,}, + {NET_IPV4_CONF_NOPOLICY, "disable_policy", + &ipv4_devconf.no_policy, sizeof(int), 0644, NULL, + &ipv4_doint_and_flush, &ipv4_doint_and_flush_strategy}, {NET_IPV4_CONF_FORCE_IGMP_VERSION, "force_igmp_version", &ipv4_devconf.force_igmp_version, sizeof(int), 0644, NULL, &proc_dointvec}, Index: net/ipv4/esp4.c =================================================================== RCS file: net/ipv4/esp4.c diff -N net/ipv4/esp4.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ b/net/ipv4/esp4.c 16 Apr 2004 13:16:21 -0000 1.5.18.1 @@ -0,0 +1,613 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX_SG_ONSTACK 4 + +/* decapsulation data for use when post-processing */ +struct esp_decap_data { + xfrm_address_t saddr; + __u16 sport; + __u8 proto; +}; + +int esp_output(struct sk_buff *skb) +{ + int err; + struct dst_entry *dst = skb->dst; + struct xfrm_state *x = dst->xfrm; + struct iphdr *iph, *top_iph; + struct ip_esp_hdr *esph; + struct crypto_tfm *tfm; + struct esp_data *esp; + struct sk_buff *trailer; + struct udphdr *uh = NULL; + struct xfrm_encap_tmpl *encap = NULL; + int blksize; + int clen; + int alen; + int nfrags; + union { + struct iphdr iph; + char buf[60]; + } tmp_iph; + + /* First, if the skb is not checksummed, complete checksum. 
*/ + if (skb->ip_summed == CHECKSUM_HW && skb_checksum_help(skb) == NULL) { + err = -EINVAL; + goto error_nolock; + } + + spin_lock_bh(&x->lock); + err = xfrm_check_output(x, skb, AF_INET); + if (err) + goto error; + err = -ENOMEM; + + /* Strip IP header in transport mode. Save it. */ + if (!x->props.mode) { + iph = skb->nh.iph; + memcpy(&tmp_iph, iph, iph->ihl*4); + __skb_pull(skb, iph->ihl*4); + } + /* Now skb is pure payload to encrypt */ + + /* Round to block size */ + clen = skb->len; + + esp = x->data; + alen = esp->auth.icv_trunc_len; + tfm = esp->conf.tfm; + blksize = (crypto_tfm_alg_blocksize(tfm) + 3) & ~3; + clen = (clen + 2 + blksize-1)&~(blksize-1); + if (esp->conf.padlen) + clen = (clen + esp->conf.padlen-1)&~(esp->conf.padlen-1); + + if ((nfrags = skb_cow_data(skb, clen-skb->len+alen, &trailer)) < 0) + goto error; + + /* Fill padding: pad byte i holds i+1; the byte after the padding holds the pad length. */ + do { + int i; + for (i=0; i < clen-skb->len - 2; i++) + *(u8*)(trailer->tail + i) = i+1; + } while (0); + *(u8*)(trailer->tail + clen-skb->len - 2) = (clen - skb->len)-2; + pskb_put(skb, trailer, clen - skb->len); + + encap = x->encap; + + iph = skb->nh.iph; + if (x->props.mode) { + top_iph = (struct iphdr*)skb_push(skb, x->props.header_len); + esph = (struct ip_esp_hdr*)(top_iph+1); + if (encap && encap->encap_type) { + switch (encap->encap_type) { + case UDP_ENCAP_ESPINUDP: + uh = (struct udphdr*) esph; + esph = (struct ip_esp_hdr*)(uh+1); + top_iph->protocol = IPPROTO_UDP; + break; + default: + printk(KERN_INFO + "esp_output(): Unhandled encap: %u\n", + encap->encap_type); + top_iph->protocol = IPPROTO_ESP; + break; + } + } else + top_iph->protocol = IPPROTO_ESP; + *(u8*)(trailer->tail - 1) = IPPROTO_IPIP; + top_iph->ihl = 5; + top_iph->version = 4; + top_iph->tos = iph->tos; /* DS disclosed */ + if (x->props.flags & XFRM_STATE_NOECN) + IP_ECN_clear(top_iph); + top_iph->tot_len = htons(skb->len + alen); + top_iph->frag_off = iph->frag_off&htons(IP_DF); + if (!(top_iph->frag_off)) + ip_select_ident(top_iph, dst, 0); +
top_iph->ttl = iph->ttl; /* TTL disclosed */ + top_iph->check = 0; + top_iph->saddr = x->props.saddr.a4; + top_iph->daddr = x->id.daddr.a4; + memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + } else { + esph = (struct ip_esp_hdr*)skb_push(skb, x->props.header_len); + top_iph = (struct iphdr*)skb_push(skb, iph->ihl*4); + memcpy(top_iph, &tmp_iph, iph->ihl*4); + if (encap && encap->encap_type) { + switch (encap->encap_type) { + case UDP_ENCAP_ESPINUDP: + uh = (struct udphdr*) esph; + esph = (struct ip_esp_hdr*)(uh+1); + top_iph->protocol = IPPROTO_UDP; + break; + default: + printk(KERN_INFO + "esp_output(): Unhandled encap: %u\n", + encap->encap_type); + top_iph->protocol = IPPROTO_ESP; + break; + } + } else + top_iph->protocol = IPPROTO_ESP; + iph = &tmp_iph.iph; + top_iph->tot_len = htons(skb->len + alen); + top_iph->check = 0; + top_iph->frag_off = iph->frag_off; + *(u8*)(trailer->tail - 1) = iph->protocol; + } + + /* this is non-NULL only with UDP Encapsulation */ + if (encap && uh) { + uh->source = encap->encap_sport; + uh->dest = encap->encap_dport; + uh->len = htons(skb->len + alen - sizeof(struct iphdr)); + uh->check = 0; + } + + esph->spi = x->id.spi; + esph->seq_no = htonl(++x->replay.oseq); + + if (esp->conf.ivlen) + crypto_cipher_set_iv(tfm, esp->conf.ivec, crypto_tfm_alg_ivsize(tfm)); + + do { + struct scatterlist sgbuf[nfrags>MAX_SG_ONSTACK ? 
0 : nfrags]; + struct scatterlist *sg = sgbuf; + + if (unlikely(nfrags > MAX_SG_ONSTACK)) { + sg = kmalloc(sizeof(struct scatterlist)*nfrags, GFP_ATOMIC); + if (!sg) + goto error; + } + skb_to_sgvec(skb, sg, esph->enc_data+esp->conf.ivlen-skb->data, clen); + crypto_cipher_encrypt(tfm, sg, sg, clen); + if (unlikely(sg != sgbuf)) + kfree(sg); + } while (0); + + if (esp->conf.ivlen) { + memcpy(esph->enc_data, esp->conf.ivec, crypto_tfm_alg_ivsize(tfm)); + crypto_cipher_get_iv(tfm, esp->conf.ivec, crypto_tfm_alg_ivsize(tfm)); + } + + if (esp->auth.icv_full_len) { + esp->auth.icv(esp, skb, (u8*)esph-skb->data, + sizeof(struct ip_esp_hdr) + esp->conf.ivlen+clen, trailer->tail); + pskb_put(skb, trailer, alen); + } + + ip_send_check(top_iph); + + skb->nh.raw = skb->data; + + x->curlft.bytes += skb->len; + x->curlft.packets++; + spin_unlock_bh(&x->lock); + if ((skb->dst = dst_pop(dst)) == NULL) { + err = -EHOSTUNREACH; + goto error_nolock; + } + return NET_XMIT_BYPASS; + +error: + spin_unlock_bh(&x->lock); +error_nolock: + kfree_skb(skb); + return err; +} + +/* + * Note: detecting truncated vs. non-truncated authentication data is very + * expensive, so we only support truncated data, which is the recommended + * and common case. + */ +int esp_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb) +{ + struct iphdr *iph; + struct ip_esp_hdr *esph; + struct esp_data *esp = x->data; + struct sk_buff *trailer; + int blksize = crypto_tfm_alg_blocksize(esp->conf.tfm); + int alen = esp->auth.icv_trunc_len; + int elen = skb->len - sizeof(struct ip_esp_hdr) - esp->conf.ivlen - alen; + int nfrags; + int encap_len = 0; + + if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr))) + goto out; + + if (elen <= 0 || (elen & (blksize-1))) + goto out; + + /* If integrity check is required, do this. 
*/ + if (esp->auth.icv_full_len) { + u8 sum[esp->auth.icv_full_len]; + u8 sum1[alen]; + + esp->auth.icv(esp, skb, 0, skb->len-alen, sum); + + if (skb_copy_bits(skb, skb->len-alen, sum1, alen)) + BUG(); + + if (unlikely(memcmp(sum, sum1, alen))) { + x->stats.integrity_failed++; + goto out; + } + } + + if ((nfrags = skb_cow_data(skb, 0, &trailer)) < 0) + goto out; + + skb->ip_summed = CHECKSUM_NONE; + + esph = (struct ip_esp_hdr*)skb->data; + iph = skb->nh.iph; + + /* Get ivec. This can be wrong, check against another impls. */ + if (esp->conf.ivlen) + crypto_cipher_set_iv(esp->conf.tfm, esph->enc_data, crypto_tfm_alg_ivsize(esp->conf.tfm)); + + { + u8 nexthdr[2]; + struct scatterlist sgbuf[nfrags>MAX_SG_ONSTACK ? 0 : nfrags]; + struct scatterlist *sg = sgbuf; + u8 workbuf[60]; + int padlen; + + if (unlikely(nfrags > MAX_SG_ONSTACK)) { + sg = kmalloc(sizeof(struct scatterlist)*nfrags, GFP_ATOMIC); + if (!sg) + goto out; + } + skb_to_sgvec(skb, sg, sizeof(struct ip_esp_hdr) + esp->conf.ivlen, elen); + crypto_cipher_decrypt(esp->conf.tfm, sg, sg, elen); + if (unlikely(sg != sgbuf)) + kfree(sg); + + if (skb_copy_bits(skb, skb->len-alen-2, nexthdr, 2)) + BUG(); + + padlen = nexthdr[0]; + if (padlen+2 >= elen) + goto out; + + /* ... check padding bits here. Silly. 
:-) */ + + if (x->encap && decap && decap->decap_type) { + struct esp_decap_data *encap_data; + struct udphdr *uh = (struct udphdr *) (iph+1); + + encap_data = (struct esp_decap_data *) (decap->decap_data); + encap_data->proto = 0; + + switch (decap->decap_type) { + case UDP_ENCAP_ESPINUDP: + + if ((void*)uh == (void*)esph) { + printk(KERN_DEBUG + "esp_input(): Got ESP; expecting ESPinUDP\n"); + break; + } + + encap_data->proto = AF_INET; + encap_data->saddr.a4 = iph->saddr; + encap_data->sport = uh->source; + encap_len = (void*)esph - (void*)uh; + if (encap_len != sizeof(*uh)) + printk(KERN_DEBUG + "esp_input(): UDP -> ESP: too much room: %d\n", + encap_len); + break; + + default: + printk(KERN_INFO + "esp_input(): processing unknown encap type: %u\n", + decap->decap_type); + break; + } + } + + iph->protocol = nexthdr[1]; + pskb_trim(skb, skb->len - alen - padlen - 2); + memcpy(workbuf, skb->nh.raw, iph->ihl*4); + skb->h.raw = skb_pull(skb, sizeof(struct ip_esp_hdr) + esp->conf.ivlen); + skb->nh.raw += encap_len + sizeof(struct ip_esp_hdr) + esp->conf.ivlen; + memcpy(skb->nh.raw, workbuf, iph->ihl*4); + skb->nh.iph->tot_len = htons(skb->len); + } + + return 0; + +out: + return -EINVAL; +} + +int esp_post_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb) +{ + + if (x->encap) { + struct xfrm_encap_tmpl *encap; + struct esp_decap_data *decap_data; + + encap = x->encap; + decap_data = (struct esp_decap_data *)(decap->decap_data); + + /* first, make sure that the decap type == the encap type */ + if (encap->encap_type != decap->decap_type) + return -EINVAL; + + /* Next, if we don't have an encap type, then ignore it */ + if (!encap->encap_type) + return 0; + + switch (encap->encap_type) { + case UDP_ENCAP_ESPINUDP: + /* + * 1) if the NAT-T peer's IP or port changed then + * advertize the change to the keying daemon. + * This is an inbound SA, so just compare + * SRC ports. 
+ */ + if (decap_data->proto == AF_INET && + (decap_data->saddr.a4 != x->props.saddr.a4 || + decap_data->sport != encap->encap_sport)) { + xfrm_address_t ipaddr; + + ipaddr.a4 = decap_data->saddr.a4; + km_new_mapping(x, &ipaddr, decap_data->sport); + + /* XXX: perhaps add an extra + * policy check here, to see + * if we should allow or + * reject a packet from a + * different source + * address/port. + */ + } + + /* + * 2) ignore UDP/TCP checksums in case + * of NAT-T in Transport Mode, or + * perform other post-processing fixes + * as per * draft-ietf-ipsec-udp-encaps-06, + * section 3.1.2 + */ + if (!x->props.mode) + skb->ip_summed = CHECKSUM_UNNECESSARY; + + break; + default: + printk(KERN_INFO + "esp4_post_input(): Unhandled encap type: %u\n", + encap->encap_type); + break; + } + } + return 0; +} + +static u32 esp4_get_max_size(struct xfrm_state *x, int mtu) +{ + struct esp_data *esp = x->data; + u32 blksize = crypto_tfm_alg_blocksize(esp->conf.tfm); + + if (x->props.mode) { + mtu = (mtu + 2 + blksize-1)&~(blksize-1); + } else { + /* The worst case. 
*/ + mtu += 2 + blksize; + } + if (esp->conf.padlen) + mtu = (mtu + esp->conf.padlen-1)&~(esp->conf.padlen-1); + + return mtu + x->props.header_len + esp->auth.icv_trunc_len; +} + +void esp4_err(struct sk_buff *skb, u32 info) +{ + struct iphdr *iph = (struct iphdr*)skb->data; + struct ip_esp_hdr *esph = (struct ip_esp_hdr*)(skb->data+(iph->ihl<<2)); + struct xfrm_state *x; + + if (skb->h.icmph->type != ICMP_DEST_UNREACH || + skb->h.icmph->code != ICMP_FRAG_NEEDED) + return; + + x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET); + if (!x) + return; + printk(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n", + ntohl(esph->spi), ntohl(iph->daddr)); + xfrm_state_put(x); +} + +void esp_destroy(struct xfrm_state *x) +{ + struct esp_data *esp = x->data; + + if (!esp) + return; + + if (esp->conf.tfm) { + crypto_free_tfm(esp->conf.tfm); + esp->conf.tfm = NULL; + } + if (esp->conf.ivec) { + kfree(esp->conf.ivec); + esp->conf.ivec = NULL; + } + if (esp->auth.tfm) { + crypto_free_tfm(esp->auth.tfm); + esp->auth.tfm = NULL; + } + if (esp->auth.work_icv) { + kfree(esp->auth.work_icv); + esp->auth.work_icv = NULL; + } + kfree(esp); +} + +int esp_init_state(struct xfrm_state *x, void *args) +{ + struct esp_data *esp = NULL; + + /* null auth and encryption can have zero length keys */ + if (x->aalg) { + if (x->aalg->alg_key_len > 512) + goto error; + } + if (x->ealg == NULL) + goto error; + + esp = kmalloc(sizeof(*esp), GFP_KERNEL); + if (esp == NULL) + return -ENOMEM; + + memset(esp, 0, sizeof(*esp)); + + if (x->aalg) { + struct xfrm_algo_desc *aalg_desc; + + esp->auth.key = x->aalg->alg_key; + esp->auth.key_len = (x->aalg->alg_key_len+7)/8; + esp->auth.tfm = crypto_alloc_tfm(x->aalg->alg_name, 0); + if (esp->auth.tfm == NULL) + goto error; + esp->auth.icv = esp_hmac_digest; + + aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name); + BUG_ON(!aalg_desc); + + if (aalg_desc->uinfo.auth.icv_fullbits/8 != + crypto_tfm_alg_digestsize(esp->auth.tfm)) { 
+ printk(KERN_INFO "ESP: %s digestsize %u != %hu\n", + x->aalg->alg_name, + crypto_tfm_alg_digestsize(esp->auth.tfm), + aalg_desc->uinfo.auth.icv_fullbits/8); + goto error; + } + + esp->auth.icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8; + esp->auth.icv_trunc_len = aalg_desc->uinfo.auth.icv_truncbits/8; + + esp->auth.work_icv = kmalloc(esp->auth.icv_full_len, GFP_KERNEL); + if (!esp->auth.work_icv) + goto error; + } + esp->conf.key = x->ealg->alg_key; + esp->conf.key_len = (x->ealg->alg_key_len+7)/8; + if (x->props.ealgo == SADB_EALG_NULL) + esp->conf.tfm = crypto_alloc_tfm(x->ealg->alg_name, CRYPTO_TFM_MODE_ECB); + else + esp->conf.tfm = crypto_alloc_tfm(x->ealg->alg_name, CRYPTO_TFM_MODE_CBC); + if (esp->conf.tfm == NULL) + goto error; + esp->conf.ivlen = crypto_tfm_alg_ivsize(esp->conf.tfm); + esp->conf.padlen = 0; + if (esp->conf.ivlen && (esp->conf.ivec = kmalloc(esp->conf.ivlen, GFP_KERNEL)) == NULL) + goto error; /* IV buffer allocation failed: bail out rather than randomize through NULL */ + if (esp->conf.ivec) /* still NULL when the cipher uses no IV */ + get_random_bytes(esp->conf.ivec, esp->conf.ivlen); + crypto_cipher_setkey(esp->conf.tfm, esp->conf.key, esp->conf.key_len); + x->props.header_len = sizeof(struct ip_esp_hdr) + esp->conf.ivlen; + if (x->props.mode) + x->props.header_len += sizeof(struct iphdr); + if (x->encap) { + struct xfrm_encap_tmpl *encap = x->encap; + + if (encap->encap_type) { + switch (encap->encap_type) { + case UDP_ENCAP_ESPINUDP: + x->props.header_len += sizeof(struct udphdr); + break; + default: + printk (KERN_INFO + "esp_init_state(): Unhandled encap type: %u\n", + encap->encap_type); + break; + } + } + } + x->data = esp; + x->props.trailer_len = esp4_get_max_size(x, 0) - x->props.header_len; + return 0; + +error: + if (esp) { + if (esp->auth.tfm) + crypto_free_tfm(esp->auth.tfm); + if (esp->auth.work_icv) + kfree(esp->auth.work_icv); + if (esp->conf.tfm) + crypto_free_tfm(esp->conf.tfm); + kfree(esp); + } + return -EINVAL; +} + +static struct xfrm_type esp_type = +{ + .description = "ESP4", + .owner = THIS_MODULE, + .proto = IPPROTO_ESP, + .init_state = esp_init_state, +
.destructor = esp_destroy, + .get_max_size = esp4_get_max_size, + .input = esp_input, + .post_input = esp_post_input, + .output = esp_output +}; + +static struct inet_protocol esp4_protocol = { + .handler = xfrm4_rcv, + .err_handler = esp4_err, + .no_policy = 1, +}; + +int __init esp4_init(void) +{ + struct xfrm_decap_state decap; + + if (sizeof(struct esp_decap_data) > + sizeof(decap.decap_data)) { /* esp_input() stores a struct esp_decap_data in decap_data, so it must fit */ + extern void decap_data_too_small(void); + + decap_data_too_small(); + } + + SET_MODULE_OWNER(&esp_type); + if (xfrm_register_type(&esp_type, AF_INET) < 0) { + printk(KERN_INFO "ip esp init: can't add xfrm type\n"); + return -EAGAIN; + } + if (inet_add_protocol(&esp4_protocol, IPPROTO_ESP) < 0) { + printk(KERN_INFO "ip esp init: can't add protocol\n"); + xfrm_unregister_type(&esp_type, AF_INET); + return -EAGAIN; + } + return 0; +} + +static void __exit esp4_fini(void) +{ + if (inet_del_protocol(&esp4_protocol, IPPROTO_ESP) < 0) + printk(KERN_INFO "ip esp close: can't remove protocol\n"); + if (xfrm_unregister_type(&esp_type, AF_INET) < 0) + printk(KERN_INFO "ip esp close: can't remove xfrm type\n"); +} + +module_init(esp4_init); +module_exit(esp4_fini); +MODULE_LICENSE("GPL"); Index: net/ipv4/fib_frontend.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv4/fib_frontend.c,v retrieving revision 1.1.1.17 retrieving revision 1.1.1.17.2.1 diff -u -r1.1.1.17 -r1.1.1.17.2.1 --- a/net/ipv4/fib_frontend.c 25 Aug 2003 11:44:44 -0000 1.1.1.17 +++ b/net/ipv4/fib_frontend.c 16 Apr 2004 13:16:21 -0000 1.1.1.17.2.1 @@ -144,17 +144,15 @@ struct net_device * ip_dev_find(u32 addr) { - struct rt_key key; + struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } }; struct fib_result res; struct net_device *dev = NULL; - memset(&key, 0, sizeof(key)); - key.dst = addr; #ifdef CONFIG_IP_MULTIPLE_TABLES res.r = NULL; #endif - if (!local_table || local_table->tb_lookup(local_table, &key, &res)) { +
if (!local_table || local_table->tb_lookup(local_table, &fl, &res)) { return NULL; } if (res.type != RTN_LOCAL) @@ -170,7 +168,7 @@ unsigned inet_addr_type(u32 addr) { - struct rt_key key; + struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } }; struct fib_result res; unsigned ret = RTN_BROADCAST; @@ -179,15 +177,13 @@ if (MULTICAST(addr)) return RTN_MULTICAST; - memset(&key, 0, sizeof(key)); - key.dst = addr; #ifdef CONFIG_IP_MULTIPLE_TABLES res.r = NULL; #endif if (local_table) { ret = RTN_UNICAST; - if (local_table->tb_lookup(local_table, &key, &res) == 0) { + if (local_table->tb_lookup(local_table, &fl, &res) == 0) { ret = res.type; fib_res_put(&res); } @@ -207,18 +203,15 @@ struct net_device *dev, u32 *spec_dst, u32 *itag) { struct in_device *in_dev; - struct rt_key key; + struct flowi fl = { .nl_u = { .ip4_u = + { .daddr = src, + .saddr = dst, + .tos = tos } }, + .iif = oif }; struct fib_result res; int no_addr, rpf; int ret; - key.dst = src; - key.src = dst; - key.tos = tos; - key.oif = 0; - key.iif = oif; - key.scope = RT_SCOPE_UNIVERSE; - no_addr = rpf = 0; read_lock(&inetdev_lock); in_dev = __in_dev_get(dev); @@ -231,7 +224,7 @@ if (in_dev == NULL) goto e_inval; - if (fib_lookup(&key, &res)) + if (fib_lookup(&fl, &res)) goto last_resort; if (res.type != RTN_UNICAST) goto e_inval_res; @@ -252,10 +245,10 @@ goto last_resort; if (rpf) goto e_inval; - key.oif = dev->ifindex; + fl.oif = dev->ifindex; ret = 0; - if (fib_lookup(&key, &res) == 0) { + if (fib_lookup(&fl, &res) == 0) { if (res.type == RTN_UNICAST) { *spec_dst = FIB_RES_PREFSRC(res); ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; Index: net/ipv4/fib_hash.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv4/fib_hash.c,v retrieving revision 1.1.1.14 retrieving revision 1.1.1.14.2.1 diff -u -r1.1.1.14 -r1.1.1.14.2.1 --- a/net/ipv4/fib_hash.c 25 Aug 2003 11:44:44 -0000 1.1.1.14 +++ b/net/ipv4/fib_hash.c 
16 Apr 2004 13:16:21 -0000 1.1.1.14.2.1 @@ -290,7 +290,7 @@ } static int -fn_hash_lookup(struct fib_table *tb, const struct rt_key *key, struct fib_result *res) +fn_hash_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) { int err; struct fn_zone *fz; @@ -299,7 +299,7 @@ read_lock(&fib_hash_lock); for (fz = t->fn_zone_list; fz; fz = fz->fz_next) { struct fib_node *f; - fn_key_t k = fz_key(key->dst, fz); + fn_key_t k = fz_key(flp->fl4_dst, fz); for (f = fz_chain(k, fz); f; f = f->fn_next) { if (!fn_key_eq(k, f->fn_key)) { @@ -309,17 +309,17 @@ continue; } #ifdef CONFIG_IP_ROUTE_TOS - if (f->fn_tos && f->fn_tos != key->tos) + if (f->fn_tos && f->fn_tos != flp->fl4_tos) continue; #endif f->fn_state |= FN_S_ACCESSED; if (f->fn_state&FN_S_ZOMBIE) continue; - if (f->fn_scope < key->scope) + if (f->fn_scope < flp->fl4_scope) continue; - err = fib_semantic_match(f->fn_type, FIB_INFO(f), key, res); + err = fib_semantic_match(f->fn_type, FIB_INFO(f), flp, res); if (err == 0) { res->type = f->fn_type; res->scope = f->fn_scope; @@ -362,7 +362,7 @@ } static void -fn_hash_select_default(struct fib_table *tb, const struct rt_key *key, struct fib_result *res) +fn_hash_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) { int order, last_idx; struct fib_node *f; Index: net/ipv4/fib_rules.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv4/fib_rules.c,v retrieving revision 1.1.1.15 retrieving revision 1.1.1.15.2.1 diff -u -r1.1.1.15 -r1.1.1.15.2.1 --- a/net/ipv4/fib_rules.c 18 Feb 2004 13:36:32 -0000 1.1.1.15 +++ b/net/ipv4/fib_rules.c 16 Apr 2004 13:16:21 -0000 1.1.1.15.2.1 @@ -307,28 +307,28 @@ } } -int fib_lookup(const struct rt_key *key, struct fib_result *res) +int fib_lookup(const struct flowi *flp, struct fib_result *res) { int err; struct fib_rule *r, *policy; struct fib_table *tb; - u32 daddr = key->dst; - u32 saddr = 
key->src; + u32 daddr = flp->fl4_dst; + u32 saddr = flp->fl4_src; FRprintk("Lookup: %u.%u.%u.%u <- %u.%u.%u.%u ", - NIPQUAD(key->dst), NIPQUAD(key->src)); + NIPQUAD(flp->fl4_dst), NIPQUAD(flp->fl4_src)); read_lock(&fib_rules_lock); for (r = fib_rules; r; r=r->r_next) { if (((saddr^r->r_src) & r->r_srcmask) || ((daddr^r->r_dst) & r->r_dstmask) || #ifdef CONFIG_IP_ROUTE_TOS - (r->r_tos && r->r_tos != key->tos) || + (r->r_tos && r->r_tos != flp->fl4_tos) || #endif #ifdef CONFIG_IP_ROUTE_FWMARK - (r->r_fwmark && r->r_fwmark != key->fwmark) || + (r->r_fwmark && r->r_fwmark != flp->fl4_fwmark) || #endif - (r->r_ifindex && r->r_ifindex != key->iif)) + (r->r_ifindex && r->r_ifindex != flp->iif)) continue; FRprintk("tb %d r %d ", r->r_table, r->r_action); @@ -351,7 +351,7 @@ if ((tb = fib_get_table(r->r_table)) == NULL) continue; - err = tb->tb_lookup(tb, key, res); + err = tb->tb_lookup(tb, flp, res); if (err == 0) { res->r = policy; if (policy) @@ -369,13 +369,13 @@ return -ENETUNREACH; } -void fib_select_default(const struct rt_key *key, struct fib_result *res) +void fib_select_default(const struct flowi *flp, struct fib_result *res) { if (res->r && res->r->r_action == RTN_UNICAST && FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) { struct fib_table *tb; if ((tb = fib_get_table(res->r->r_table)) != NULL) - tb->tb_select_default(tb, key, res); + tb->tb_select_default(tb, flp, res); } } Index: net/ipv4/fib_semantics.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv4/fib_semantics.c,v retrieving revision 1.1.1.18 retrieving revision 1.1.1.18.2.1 diff -u -r1.1.1.18 -r1.1.1.18.2.1 --- a/net/ipv4/fib_semantics.c 25 Aug 2003 11:44:44 -0000 1.1.1.18 +++ b/net/ipv4/fib_semantics.c 16 Apr 2004 13:16:21 -0000 1.1.1.18.2.1 @@ -349,7 +349,6 @@ int err; if (nh->nh_gw) { - struct rt_key key; struct fib_result res; #ifdef CONFIG_IP_ROUTE_PERVASIVE @@ -372,16 +371,18 @@ 
nh->nh_scope = RT_SCOPE_LINK; return 0; } - memset(&key, 0, sizeof(key)); - key.dst = nh->nh_gw; - key.oif = nh->nh_oif; - key.scope = r->rtm_scope + 1; - - /* It is not necessary, but requires a bit of thinking */ - if (key.scope < RT_SCOPE_LINK) - key.scope = RT_SCOPE_LINK; - if ((err = fib_lookup(&key, &res)) != 0) - return err; + { + struct flowi fl = { .nl_u = { .ip4_u = + { .daddr = nh->nh_gw, + .scope = r->rtm_scope + 1 } }, + .oif = nh->nh_oif }; + + /* It is not necessary, but requires a bit of thinking */ + if (fl.fl4_scope < RT_SCOPE_LINK) + fl.fl4_scope = RT_SCOPE_LINK; + if ((err = fib_lookup(&fl, &res)) != 0) + return err; + } err = -EINVAL; if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) goto out; @@ -578,7 +579,7 @@ } int -fib_semantic_match(int type, struct fib_info *fi, const struct rt_key *key, struct fib_result *res) +fib_semantic_match(int type, struct fib_info *fi, const struct flowi *flp, struct fib_result *res) { int err = fib_props[type].error; @@ -603,7 +604,7 @@ for_nexthops(fi) { if (nh->nh_flags&RTNH_F_DEAD) continue; - if (!key->oif || key->oif == nh->nh_oif) + if (!flp->oif || flp->oif == nh->nh_oif) break; } #ifdef CONFIG_IP_ROUTE_MULTIPATH @@ -949,7 +950,7 @@ fair weighted route distribution. 
*/ -void fib_select_multipath(const struct rt_key *key, struct fib_result *res) +void fib_select_multipath(const struct flowi *flp, struct fib_result *res) { struct fib_info *fi = res->fi; int w; Index: net/ipv4/icmp.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv4/icmp.c,v retrieving revision 1.1.1.25 retrieving revision 1.1.1.25.2.1 diff -u -r1.1.1.25 -r1.1.1.25.2.1 --- a/net/ipv4/icmp.c 14 Apr 2004 13:05:41 -0000 1.1.1.25 +++ b/net/ipv4/icmp.c 16 Apr 2004 13:16:21 -0000 1.1.1.25.2.1 @@ -101,7 +101,6 @@ int offset; int data_len; - unsigned int csum; struct { struct icmphdr icmph; __u32 times[3]; @@ -139,8 +138,6 @@ { EHOSTUNREACH, 1 } /* ICMP_PREC_CUTOFF */ }; -extern int sysctl_ip_default_ttl; - /* Control parameters for ECHO replies. */ int sysctl_icmp_echo_ignore_all; int sysctl_icmp_echo_ignore_broadcasts; @@ -281,37 +278,47 @@ * Checksum each fragment, and on the first include the headers and final checksum. */ -static int icmp_glue_bits(const void *p, char *to, unsigned int offset, unsigned int fraglen) +int +icmp_glue_bits(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) { - struct icmp_bxm *icmp_param = (struct icmp_bxm *)p; - struct icmphdr *icmph; + struct icmp_bxm *icmp_param = (struct icmp_bxm *)from; unsigned int csum; - if (offset) { - icmp_param->csum=skb_copy_and_csum_bits(icmp_param->skb, - icmp_param->offset+(offset-icmp_param->head_len), - to, fraglen,icmp_param->csum); - return 0; - } + csum = skb_copy_and_csum_bits(icmp_param->skb, + icmp_param->offset + offset, + to, len, 0); - /* - * First fragment includes header. Note that we've done - * the other fragments first, so that we get the checksum - * for the whole packet here. 
- */ - csum = csum_partial_copy_nocheck((void *)&icmp_param->data, - to, icmp_param->head_len, - icmp_param->csum); - csum=skb_copy_and_csum_bits(icmp_param->skb, - icmp_param->offset, - to+icmp_param->head_len, - fraglen-icmp_param->head_len, - csum); - icmph=(struct icmphdr *)to; - icmph->checksum = csum_fold(csum); + skb->csum = csum_block_add(skb->csum, csum, odd); return 0; } +static void +icmp_push_reply(struct icmp_bxm *icmp_param, struct ipcm_cookie *ipc, struct rtable *rt) +{ + struct sk_buff *skb; + + ip_append_data(icmp_socket->sk, icmp_glue_bits, icmp_param, + icmp_param->data_len+icmp_param->head_len, + icmp_param->head_len, + ipc, rt, MSG_DONTWAIT); + + if ((skb = skb_peek(&icmp_socket->sk->write_queue)) != NULL) { + struct icmphdr *icmph = skb->h.icmph; + unsigned int csum = 0; + struct sk_buff *skb1; + + skb_queue_walk(&icmp_socket->sk->write_queue, skb1) { + csum = csum_add(csum, skb1->csum); + } + csum = csum_partial_copy_nocheck((void *)&icmp_param->data, + (char*)icmph, icmp_param->head_len, + csum); + icmph->checksum = csum_fold(csum); + skb->ip_summed = CHECKSUM_NONE; + ip_push_pending_frames(icmp_socket->sk); + } +} + /* * Driving logic for building and sending ICMP messages. 
*/ @@ -330,11 +337,9 @@ return; icmp_param->data.icmph.checksum=0; - icmp_param->csum=0; icmp_out_count(icmp_param->data.icmph.type); sk->protinfo.af_inet.tos = skb->nh.iph->tos; - sk->protinfo.af_inet.ttl = sysctl_ip_default_ttl; daddr = ipc.addr = rt->rt_src; ipc.opt = NULL; if (icmp_param->replyopts.optlen) { @@ -342,14 +347,18 @@ if (ipc.opt->srr) daddr = icmp_param->replyopts.faddr; } - if (ip_route_output(&rt, daddr, rt->rt_spec_dst, RT_TOS(skb->nh.iph->tos), 0)) - goto out; - if (icmpv4_xrlim_allow(rt, icmp_param->data.icmph.type, - icmp_param->data.icmph.code)) { - ip_build_xmit(sk, icmp_glue_bits, icmp_param, - icmp_param->data_len+icmp_param->head_len, - &ipc, rt, MSG_DONTWAIT); + { + struct flowi fl = { .nl_u = { .ip4_u = + { .daddr = daddr, + .saddr = rt->rt_spec_dst, + .tos = RT_TOS(skb->nh.iph->tos) } }, + .proto = IPPROTO_ICMP }; + if (ip_route_output_key(&rt, &fl)) + goto out; } + if (icmpv4_xrlim_allow(rt, icmp_param->data.icmph.type, + icmp_param->data.icmph.code)) + icmp_push_reply(icmp_param, &ipc, rt); ip_rt_put(rt); out: icmp_xmit_unlock(); @@ -446,8 +455,8 @@ * Restore original addresses if packet has been translated. 
*/ if (rt->rt_flags&RTCF_NAT && IPCB(skb_in)->flags&IPSKB_TRANSLATED) { - iph->daddr = rt->key.dst; - iph->saddr = rt->key.src; + iph->daddr = rt->fl.fl4_dst; + iph->saddr = rt->fl.fl4_src; } #endif @@ -459,9 +468,14 @@ ((iph->tos & IPTOS_TOS_MASK) | IPTOS_PREC_INTERNETCONTROL) : iph->tos; - if (ip_route_output(&rt, iph->saddr, saddr, RT_TOS(tos), 0)) - goto out; - + { + struct flowi fl = { .nl_u = { .ip4_u = { .daddr = iph->saddr, + .saddr = saddr, + .tos = RT_TOS(tos) } }, + .proto = IPPROTO_ICMP }; + if (ip_route_output_key(&rt, &fl)) + goto out; + } if (ip_options_echo(&icmp_param.replyopts, skb_in)) goto ende; @@ -474,17 +488,20 @@ icmp_param.data.icmph.code=code; icmp_param.data.icmph.un.gateway = info; icmp_param.data.icmph.checksum=0; - icmp_param.csum=0; icmp_param.skb=skb_in; icmp_param.offset=skb_in->nh.raw - skb_in->data; icmp_out_count(icmp_param.data.icmph.type); icmp_socket->sk->protinfo.af_inet.tos = tos; - icmp_socket->sk->protinfo.af_inet.ttl = sysctl_ip_default_ttl; ipc.addr = iph->saddr; ipc.opt = &icmp_param.replyopts; if (icmp_param.replyopts.srr) { + struct flowi fl = { .nl_u = { .ip4_u = + { .daddr = icmp_param.replyopts.faddr, + .saddr = saddr, + .tos = RT_TOS(tos) } }, + .proto = IPPROTO_ICMP }; ip_rt_put(rt); - if (ip_route_output(&rt, icmp_param.replyopts.faddr, saddr, RT_TOS(tos), 0)) + if (ip_route_output_key(&rt, &fl)) goto out; } @@ -493,7 +510,7 @@ /* RFC says return as much as we can without exceeding 576 bytes. */ - room = rt->u.dst.pmtu; + room = dst_pmtu(&rt->u.dst); if (room > 576) room = 576; room -= sizeof(struct iphdr) + icmp_param.replyopts.optlen; @@ -504,9 +521,7 @@ icmp_param.data_len = room; icmp_param.head_len = sizeof(struct icmphdr); - ip_build_xmit(icmp_socket->sk, icmp_glue_bits, &icmp_param, - icmp_param.data_len+sizeof(struct icmphdr), - &ipc, rt, MSG_DONTWAIT); + icmp_push_reply(&icmp_param, &ipc, rt); ende: ip_rt_put(rt); @@ -645,24 +660,10 @@ * we are OK. 
*/ - ipprot = (struct inet_protocol *) inet_protos[hash]; - while (ipprot) { - struct inet_protocol *nextip; - - nextip = (struct inet_protocol *) ipprot->next; - - /* - * Pass it off to everyone who wants it. - */ + ipprot = inet_protos[hash]; + if (ipprot && ipprot->err_handler) + ipprot->err_handler(skb, info); - /* RFC1122: OK. Passes appropriate ICMP errors to the */ - /* appropriate protocol layer (MUST), as per 3.2.2. */ - - if (protocol == ipprot->protocol && ipprot->err_handler) - ipprot->err_handler(skb, info); - - ipprot = nextip; - } out:; } @@ -991,7 +992,7 @@ icmp_socket_cpu(i)->sk->sndbuf = (2 * ((64 * 1024) + sizeof(struct sk_buff))); - icmp_socket_cpu(i)->sk->protinfo.af_inet.ttl = MAXTTL; + icmp_socket_cpu(i)->sk->protinfo.af_inet.uc_ttl = -1; icmp_socket_cpu(i)->sk->protinfo.af_inet.pmtudisc = IP_PMTUDISC_DONT; /* Unhash it so that IP input processing does not even Index: net/ipv4/igmp.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv4/igmp.c,v retrieving revision 1.1.1.20 retrieving revision 1.1.1.20.2.1 diff -u -r1.1.1.20 -r1.1.1.20.2.1 --- a/net/ipv4/igmp.c 14 Apr 2004 13:05:41 -0000 1.1.1.20 +++ b/net/ipv4/igmp.c 16 Apr 2004 13:16:22 -0000 1.1.1.20.2.1 @@ -218,15 +218,6 @@ #define IGMP_SIZE (sizeof(struct igmphdr)+sizeof(struct iphdr)+4) -/* Don't just hand NF_HOOK skb->dst->output, in case netfilter hook - changes route */ -static inline int -output_maybe_reroute(struct sk_buff *skb) -{ - return skb->dst->output(skb); -} - - static int is_in(struct ip_mc_list *pmc, struct ip_sf_list *psf, int type, int gdeleted, int sdeleted) { @@ -283,13 +274,18 @@ u32 dst; dst = IGMPV3_ALL_MCR; - if (ip_route_output(&rt, dst, 0, 0, dev->ifindex)) - return 0; + { + struct flowi fl = { .oif = dev->ifindex, + .nl_u = { .ip4_u = { .daddr = dst } }, + .proto = IPPROTO_IGMP }; + if (ip_route_output_key(&rt, &fl)) + return 0; + } if (rt->rt_src == 0) { ip_rt_put(rt); 
return 0; } - skb = alloc_skb(size + dev->hard_header_len + 15, GFP_ATOMIC); + skb = alloc_skb(size + LL_RESERVED_SPACE(dev), GFP_ATOMIC); if (skb == NULL) { ip_rt_put(rt); return 0; @@ -298,7 +294,7 @@ skb->dst = &rt->u.dst; skb->dev = dev; - skb_reserve(skb, (dev->hard_header_len+15)&~15); + skb_reserve(skb, LL_RESERVED_SPACE(dev)); skb->nh.iph = pip =(struct iphdr *)skb_put(skb, sizeof(struct iphdr)+4); @@ -341,7 +337,7 @@ pig->csum = ip_compute_csum((void *)skb->h.igmph, igmplen); return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, skb->dev, - output_maybe_reroute); + dst_output); } static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel) @@ -623,14 +619,19 @@ else dst = group; - if (ip_route_output(&rt, dst, 0, 0, dev->ifindex)) - return -1; + { + struct flowi fl = { .oif = dev->ifindex, + .nl_u = { .ip4_u = { .daddr = dst } }, + .proto = IPPROTO_IGMP }; + if (ip_route_output_key(&rt, &fl)) + return -1; + } if (rt->rt_src == 0) { ip_rt_put(rt); return -1; } - skb=alloc_skb(IGMP_SIZE+dev->hard_header_len+15, GFP_ATOMIC); + skb=alloc_skb(IGMP_SIZE+LL_RESERVED_SPACE(dev), GFP_ATOMIC); if (skb == NULL) { ip_rt_put(rt); return -1; @@ -638,7 +639,7 @@ skb->dst = &rt->u.dst; - skb_reserve(skb, (dev->hard_header_len+15)&~15); + skb_reserve(skb, LL_RESERVED_SPACE(dev)); skb->nh.iph = iph = (struct iphdr *)skb_put(skb, sizeof(struct iphdr)+4); @@ -666,7 +667,7 @@ ih->csum=ip_compute_csum((void *)ih, sizeof(struct igmphdr)); return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, - output_maybe_reroute); + dst_output); } static void igmp_gq_timer_expire(unsigned long data) @@ -874,7 +875,7 @@ case IGMPV2_HOST_MEMBERSHIP_REPORT: case IGMPV3_HOST_MEMBERSHIP_REPORT: /* Is it our report looped back? 
*/ - if (((struct rtable*)skb->dst)->key.iif == 0) + if (((struct rtable*)skb->dst)->fl.iif == 0) break; igmp_heard_report(in_dev, ih->group); break; @@ -1283,6 +1284,8 @@ static struct in_device * ip_mc_find_dev(struct ip_mreqn *imr) { + struct flowi fl = { .nl_u = { .ip4_u = + { .daddr = imr->imr_multiaddr.s_addr } } }; struct rtable *rt; struct net_device *dev = NULL; struct in_device *idev = NULL; @@ -1300,7 +1303,7 @@ __dev_put(dev); } - if (!dev && !ip_route_output(&rt, imr->imr_multiaddr.s_addr, 0, 0, 0)) { + if (!dev && !ip_route_output_key(&rt, &fl)) { dev = rt->u.dst.dev; ip_rt_put(rt); } Index: net/ipv4/ip_forward.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv4/ip_forward.c,v retrieving revision 1.1.1.15 retrieving revision 1.1.1.15.2.1 diff -u -r1.1.1.15 -r1.1.1.15.2.1 --- a/net/ipv4/ip_forward.c 12 Apr 2001 19:11:39 -0000 1.1.1.15 +++ b/net/ipv4/ip_forward.c 16 Apr 2004 13:16:22 -0000 1.1.1.15.2.1 @@ -40,6 +40,7 @@ #include #include #include +#include static inline int ip_forward_finish(struct sk_buff *skb) { @@ -47,36 +48,20 @@ IP_INC_STATS_BH(IpForwDatagrams); - if (opt->optlen == 0) { -#ifdef CONFIG_NET_FASTROUTE - struct rtable *rt = (struct rtable*)skb->dst; - - if (rt->rt_flags&RTCF_FAST && !netdev_fastroute_obstacles) { - struct dst_entry *old_dst; - unsigned h = ((*(u8*)&rt->key.dst)^(*(u8*)&rt->key.src))&NETDEV_FASTROUTE_HMASK; - - write_lock_irq(&skb->dev->fastpath_lock); - old_dst = skb->dev->fastpath[h]; - skb->dev->fastpath[h] = dst_clone(&rt->u.dst); - write_unlock_irq(&skb->dev->fastpath_lock); - - dst_release(old_dst); - } -#endif - return (ip_send(skb)); - } + if (unlikely(opt->optlen)) + ip_forward_options(skb); - ip_forward_options(skb); - return (ip_send(skb)); + return dst_output(skb); } int ip_forward(struct sk_buff *skb) { - struct net_device *dev2; /* Output device */ struct iphdr *iph; /* Our header */ struct rtable *rt; /* Route we 
use */ struct ip_options * opt = &(IPCB(skb)->opt); - unsigned short mtu; + + if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb)) + goto drop; if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb)) return NET_RX_SUCCESS; @@ -93,32 +78,21 @@ */ iph = skb->nh.iph; - rt = (struct rtable*)skb->dst; if (iph->ttl <= 1) goto too_many_hops; - if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway) - goto sr_failed; - - /* - * Having picked a route we can now send the frame out - * after asking the firewall permission to do so. - */ + if (!xfrm4_route_forward(skb)) + goto drop; - skb->priority = rt_tos2priority(iph->tos); - dev2 = rt->u.dst.dev; - mtu = rt->u.dst.pmtu; + iph = skb->nh.iph; + rt = (struct rtable*)skb->dst; - /* - * We now generate an ICMP HOST REDIRECT giving the route - * we calculated. - */ - if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr) - ip_rt_send_redirect(skb); + if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway) + goto sr_failed; /* We are about to mangle packet. Copy it! */ - if (skb_cow(skb, dev2->hard_header_len)) + if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len)) goto drop; iph = skb->nh.iph; @@ -126,30 +100,17 @@ ip_decrease_ttl(iph); /* - * We now may allocate a new buffer, and copy the datagram into it. - * If the indicated interface is up and running, kick it. + * We now generate an ICMP HOST REDIRECT giving the route + * we calculated. 
*/ + if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr) + ip_rt_send_redirect(skb); - if (skb->len > mtu && (ntohs(iph->frag_off) & IP_DF)) - goto frag_needed; - -#ifdef CONFIG_IP_ROUTE_NAT - if (rt->rt_flags & RTCF_NAT) { - if (ip_do_nat(skb)) { - kfree_skb(skb); - return NET_RX_BAD; - } - } -#endif + skb->priority = rt_tos2priority(iph->tos); - return NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, dev2, + return NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, rt->u.dst.dev, ip_forward_finish); -frag_needed: - IP_INC_STATS_BH(IpFragFails); - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); - goto drop; - sr_failed: /* * Strict routing permits no gatewaying Index: net/ipv4/ip_gre.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv4/ip_gre.c,v retrieving revision 1.1.1.24 retrieving revision 1.1.1.24.2.1 diff -u -r1.1.1.24 -r1.1.1.24.2.1 --- a/net/ipv4/ip_gre.c 28 Nov 2003 18:26:21 -0000 1.1.1.24 +++ b/net/ipv4/ip_gre.c 16 Apr 2004 13:16:22 -0000 1.1.1.24.2.1 @@ -37,6 +37,7 @@ #include #include #include +#include #ifdef CONFIG_IPV6 #include @@ -410,6 +411,7 @@ u16 flags; int grehlen = (iph->ihl<<2) + 4; struct sk_buff *skb2; + struct flowi fl; struct rtable *rt; if (p[1] != htons(ETH_P_IP)) @@ -486,7 +488,11 @@ skb2->nh.raw = skb2->data; /* Try to guess incoming interface */ - if (ip_route_output(&rt, eiph->saddr, 0, RT_TOS(eiph->tos), 0)) { + memset(&fl, 0, sizeof(fl)); + fl.fl4_dst = eiph->saddr; + fl.fl4_tos = RT_TOS(eiph->tos); + fl.proto = IPPROTO_GRE; + if (ip_route_output_key(&rt, &fl)) { kfree_skb(skb2); return; } @@ -496,7 +502,10 @@ if (rt->rt_flags&RTCF_LOCAL) { ip_rt_put(rt); rt = NULL; - if (ip_route_output(&rt, eiph->daddr, eiph->saddr, eiph->tos, 0) || + fl.fl4_dst = eiph->daddr; + fl.fl4_src = eiph->saddr; + fl.fl4_tos = eiph->tos; + if (ip_route_output_key(&rt, &fl) || rt->u.dst.dev->type != ARPHRD_IPGRE) { ip_rt_put(rt); kfree_skb(skb2); @@ 
-513,11 +522,11 @@ /* change mtu on this route */ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { - if (rel_info > skb2->dst->pmtu) { + if (rel_info > dst_pmtu(skb2->dst)) { kfree_skb(skb2); return; } - skb2->dst->pmtu = rel_info; + skb2->dst->ops->update_pmtu(skb2->dst, rel_info); rel_info = htonl(rel_info); } else if (type == ICMP_TIME_EXCEEDED) { struct ip_tunnel *t = (struct ip_tunnel*)skb2->dev->priv; @@ -606,6 +615,8 @@ read_lock(&ipgre_lock); if ((tunnel = ipgre_tunnel_lookup(iph->saddr, iph->daddr, key)) != NULL) { + secpath_reset(skb); + skb->mac.raw = skb->nh.raw; skb->nh.raw = __pskb_pull(skb, offset); memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); @@ -617,7 +628,7 @@ #ifdef CONFIG_NET_IPGRE_BROADCAST if (MULTICAST(iph->daddr)) { /* Looped back packet, drop it! */ - if (((struct rtable*)skb->dst)->key.iif == 0) + if (((struct rtable*)skb->dst)->fl.iif == 0) goto drop; tunnel->stat.multicast++; skb->pkt_type = PACKET_BROADCAST; @@ -665,12 +676,6 @@ return(0); } -/* Need this wrapper because NF_HOOK takes the function address */ -static inline int do_ip_send(struct sk_buff *skb) -{ - return ip_send(skb); -} - static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv; @@ -747,9 +752,17 @@ tos &= ~1; } - if (ip_route_output(&rt, dst, tiph->saddr, RT_TOS(tos), tunnel->parms.link)) { - tunnel->stat.tx_carrier_errors++; - goto tx_error; + { + struct flowi fl = { .oif = tunnel->parms.link, + .nl_u = { .ip4_u = + { .daddr = dst, + .saddr = tiph->saddr, + .tos = RT_TOS(tos) } }, + .proto = IPPROTO_GRE }; + if (ip_route_output_key(&rt, &fl)) { + tunnel->stat.tx_carrier_errors++; + goto tx_error; + } } tdev = rt->u.dst.dev; @@ -761,14 +774,14 @@ df = tiph->frag_off; if (df) - mtu = rt->u.dst.pmtu - tunnel->hlen; + mtu = dst_pmtu(&rt->u.dst) - tunnel->hlen; else - mtu = skb->dst ? skb->dst->pmtu : dev->mtu; + mtu = skb->dst ? 
dst_pmtu(skb->dst) : dev->mtu; - if (skb->protocol == htons(ETH_P_IP)) { - if (skb->dst && mtu < skb->dst->pmtu && mtu >= 68) - skb->dst->pmtu = mtu; + if (skb->dst) + skb->dst->ops->update_pmtu(skb->dst, mtu); + if (skb->protocol == htons(ETH_P_IP)) { df |= (old_iph->frag_off&htons(IP_DF)); if ((old_iph->frag_off&htons(IP_DF)) && @@ -782,11 +795,11 @@ else if (skb->protocol == htons(ETH_P_IPV6)) { struct rt6_info *rt6 = (struct rt6_info*)skb->dst; - if (rt6 && mtu < rt6->u.dst.pmtu && mtu >= IPV6_MIN_MTU) { + if (rt6 && mtu < dst_pmtu(skb->dst) && mtu >= IPV6_MIN_MTU) { if ((tunnel->parms.iph.daddr && !MULTICAST(tunnel->parms.iph.daddr)) || rt6->rt6i_dst.plen == 128) { rt6->rt6i_flags |= RTF_MODIFIED; - skb->dst->pmtu = mtu; + skb->dst->metrics[RTAX_MTU-1] = mtu; } } @@ -807,7 +820,7 @@ tunnel->err_count = 0; } - max_headroom = ((tdev->hard_header_len+15)&~15)+ gre_hlen; + max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen; if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) { struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); @@ -852,7 +865,7 @@ iph->ttl = ((struct ipv6hdr*)old_iph)->hop_limit; #endif else - iph->ttl = sysctl_ip_default_ttl; + iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT); } ((u16*)(iph+1))[0] = tunnel->parms.o_flags; @@ -1102,10 +1115,14 @@ MOD_INC_USE_COUNT; if (MULTICAST(t->parms.iph.daddr)) { + struct flowi fl = { .oif = t->parms.link, + .nl_u = { .ip4_u = + { .daddr = t->parms.iph.daddr, + .saddr = t->parms.iph.saddr, + .tos = RT_TOS(t->parms.iph.tos) } }, + .proto = IPPROTO_GRE }; struct rtable *rt; - if (ip_route_output(&rt, t->parms.iph.daddr, - t->parms.iph.saddr, RT_TOS(t->parms.iph.tos), - t->parms.link)) { + if (ip_route_output_key(&rt, &fl)) { MOD_DEC_USE_COUNT; return -EADDRNOTAVAIL; } @@ -1175,8 +1192,14 @@ /* Guess output device to choose reasonable mtu and hard_header_len */ if (iph->daddr) { + struct flowi fl = { .oif = tunnel->parms.link, + .nl_u = { .ip4_u = + { .daddr = iph->daddr, + 
.saddr = iph->saddr, + .tos = RT_TOS(iph->tos) } }, + .proto = IPPROTO_GRE }; struct rtable *rt; - if (!ip_route_output(&rt, iph->daddr, iph->saddr, RT_TOS(iph->tos), tunnel->parms.link)) { + if (!ip_route_output_key(&rt, &fl)) { tdev = rt->u.dst.dev; ip_rt_put(rt); } @@ -1257,13 +1280,8 @@ static struct inet_protocol ipgre_protocol = { - ipgre_rcv, /* GRE handler */ - ipgre_err, /* TUNNEL error control */ - 0, /* next */ - IPPROTO_GRE, /* protocol ID */ - 0, /* copy */ - NULL, /* data */ - "GRE" /* name */ + .handler = ipgre_rcv, + .err_handler = ipgre_err, }; @@ -1279,9 +1297,13 @@ { printk(KERN_INFO "GRE over IPv4 tunneling driver\n"); + if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) { + printk(KERN_INFO "ipgre init: can't add protocol\n"); + return -EAGAIN; + } + ipgre_fb_tunnel_dev.priv = (void*)&ipgre_fb_tunnel; register_netdev(&ipgre_fb_tunnel_dev); - inet_add_protocol(&ipgre_protocol); return 0; } @@ -1289,7 +1311,7 @@ void cleanup_module(void) { - if ( inet_del_protocol(&ipgre_protocol) < 0 ) + if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) printk(KERN_INFO "ipgre close: can't remove protocol\n"); unregister_netdev(&ipgre_fb_tunnel_dev); Index: net/ipv4/ip_input.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv4/ip_input.c,v retrieving revision 1.1.1.20 retrieving revision 1.1.1.20.2.1 diff -u -r1.1.1.20 -r1.1.1.20.2.1 --- a/net/ipv4/ip_input.c 3 Aug 2002 00:39:46 -0000 1.1.1.20 +++ b/net/ipv4/ip_input.c 16 Apr 2004 13:16:22 -0000 1.1.1.20.2.1 @@ -141,6 +141,7 @@ #include #include #include +#include #include #include @@ -194,34 +195,13 @@ return 0; } -/* Handle this out of line, it is rare. 
*/ -static int ip_run_ipprot(struct sk_buff *skb, struct iphdr *iph, - struct inet_protocol *ipprot, int force_copy) -{ - int ret = 0; - - do { - if (ipprot->protocol == iph->protocol) { - struct sk_buff *skb2 = skb; - if (ipprot->copy || force_copy) - skb2 = skb_clone(skb, GFP_ATOMIC); - if(skb2 != NULL) { - ret = 1; - ipprot->handler(skb2); - } - } - ipprot = (struct inet_protocol *) ipprot->next; - } while(ipprot != NULL); - - return ret; -} - static inline int ip_local_deliver_finish(struct sk_buff *skb) { int ihl = skb->nh.iph->ihl*4; #ifdef CONFIG_NETFILTER_DEBUG nf_debug_ip_local_deliver(skb); + skb->nf_debug = 0; #endif /*CONFIG_NETFILTER_DEBUG*/ __skb_pull(skb, ihl); @@ -239,44 +219,40 @@ { /* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */ int protocol = skb->nh.iph->protocol; - int hash = protocol & (MAX_INET_PROTOS - 1); - struct sock *raw_sk = raw_v4_htable[hash]; + int hash; + struct sock *raw_sk; struct inet_protocol *ipprot; - int flag; + + resubmit: + hash = protocol & (MAX_INET_PROTOS - 1); + raw_sk = raw_v4_htable[hash]; /* If there maybe a raw socket we must check - if not we * don't care less */ - if(raw_sk != NULL) - raw_sk = raw_v4_input(skb, skb->nh.iph, hash); + if (raw_sk) + raw_v4_input(skb, skb->nh.iph, hash); - ipprot = (struct inet_protocol *) inet_protos[hash]; - flag = 0; - if(ipprot != NULL) { - if(raw_sk == NULL && - ipprot->next == NULL && - ipprot->protocol == protocol) { - int ret; - - /* Fast path... */ - ret = ipprot->handler(skb); - - return ret; - } else { - flag = ip_run_ipprot(skb, skb->nh.iph, ipprot, (raw_sk != NULL)); - } - } + if ((ipprot = inet_protos[hash]) != NULL) { + int ret; - /* All protocols checked. - * If this packet was a broadcast, we may *not* reply to it, since that - * causes (proven, grin) ARP storms and a leakage of memory (i.e. all - * ICMP reply messages get queued up for transmission...) 
- */ - if(raw_sk != NULL) { /* Shift to last raw user */ - raw_rcv(raw_sk, skb); - sock_put(raw_sk); - } else if (!flag) { /* Free and report errors */ - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); + if (!ipprot->no_policy && + !xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { + kfree_skb(skb); + return 0; + } + ret = ipprot->handler(skb); + if (ret < 0) { + protocol = -ret; + goto resubmit; + } + } else { + if (!raw_sk) { + if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { + icmp_send(skb, ICMP_DEST_UNREACH, + ICMP_PROT_UNREACH, 0); + } + } kfree_skb(skb); } } @@ -364,7 +340,7 @@ } } - return skb->dst->input(skb); + return dst_input(skb); inhdr_error: IP_INC_STATS_BH(IpInHdrErrors); Index: net/ipv4/ip_nat_dumb.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv4/ip_nat_dumb.c,v retrieving revision 1.1.1.15 retrieving revision 1.1.1.15.2.1 diff -u -r1.1.1.15 -r1.1.1.15.2.1 --- a/net/ipv4/ip_nat_dumb.c 12 Apr 2001 19:11:39 -0000 1.1.1.15 +++ b/net/ipv4/ip_nat_dumb.c 16 Apr 2004 13:16:22 -0000 1.1.1.15.2.1 @@ -117,23 +117,23 @@ if (rt->rt_flags&RTCF_SNAT) { if (ciph->daddr != osaddr) { struct fib_result res; - struct rt_key key; unsigned flags = 0; - - key.src = ciph->daddr; - key.dst = ciph->saddr; - key.iif = skb->dev->ifindex; - key.oif = 0; + struct flowi fl = { + .iif = skb->dev->ifindex, + .nl_u = + { .ip4_u = + { .daddr = ciph->saddr, + .saddr = ciph->daddr, #ifdef CONFIG_IP_ROUTE_TOS - key.tos = RT_TOS(ciph->tos); -#endif -#ifdef CONFIG_IP_ROUTE_FWMARK - key.fwmark = 0; + .tos = RT_TOS(ciph->tos) #endif + } }, + .proto = ciph->protocol }; + /* Use fib_lookup() until we get our own * hash table of NATed hosts -- Rani */ - if (fib_lookup(&key, &res) == 0) { + if (fib_lookup(&fl, &res) == 0) { if (res.r) { ciph->daddr = fib_rules_policy(ciph->daddr, &res, &flags); if (ciph->daddr != idaddr) Index: net/ipv4/ip_output.c 
=================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv4/ip_output.c,v retrieving revision 1.1.1.24 retrieving revision 1.1.1.24.2.1 diff -u -r1.1.1.24 -r1.1.1.24.2.1 --- a/net/ipv4/ip_output.c 28 Nov 2003 18:26:21 -0000 1.1.1.24 +++ b/net/ipv4/ip_output.c 16 Apr 2004 13:16:22 -0000 1.1.1.24.2.1 @@ -15,6 +15,7 @@ * Stefan Becker, * Jorge Cwik, * Arnt Gulbrandsen, + * Hirokazu Takahashi, * * See ip_input.c for original log * @@ -38,6 +39,9 @@ * Marc Boucher : When call_out_firewall returns FW_QUEUE, * silently drop skb instead of failing with -EPERM. * Detlev Wengorz : Copy protocol for fragments. + * Hirokazu Takahashi: HW checksumming for outgoing UDP + * datagrams. + * Hirokazu Takahashi: sendfile() on UDP works now. */ #include @@ -108,16 +112,18 @@ return 0; } -/* Don't just hand NF_HOOK skb->dst->output, in case netfilter hook - changes route */ -static inline int -output_maybe_reroute(struct sk_buff *skb) +static inline int ip_select_ttl(struct inet_opt *inet, struct dst_entry *dst) { - return skb->dst->output(skb); + int ttl = inet->uc_ttl; + + if (ttl < 0) + ttl = dst_metric(dst, RTAX_HOPLIMIT); + return ttl; } /* * Add an ip header to a skbuff and send it out. + * */ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, u32 saddr, u32 daddr, struct ip_options *opt) @@ -138,7 +144,7 @@ iph->frag_off = htons(IP_DF); else iph->frag_off = 0; - iph->ttl = sk->protinfo.af_inet.ttl; + iph->ttl = ip_select_ttl(&sk->protinfo.af_inet, &rt->u.dst); iph->daddr = rt->rt_dst; iph->saddr = rt->rt_src; iph->protocol = sk->protocol; @@ -152,15 +158,34 @@ } ip_send_check(iph); + skb->priority = sk->priority; + /* Send it out. 
*/ return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, - output_maybe_reroute); + dst_output); } static inline int ip_finish_output2(struct sk_buff *skb) { struct dst_entry *dst = skb->dst; struct hh_cache *hh = dst->hh; + struct net_device *dev = dst->dev; + int hh_len = LL_RESERVED_SPACE(dev); + + /* Be paranoid, rather than too clever. */ + if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) { + struct sk_buff *skb2; + + skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev)); + if (skb2 == NULL) { + kfree_skb(skb); + return -ENOMEM; + } + if (skb->sk) + skb_set_owner_w(skb2, skb->sk); + kfree_skb(skb); + skb = skb2; + } #ifdef CONFIG_NETFILTER_DEBUG nf_debug_ip_finish_output2(skb); @@ -184,7 +209,7 @@ return -EINVAL; } -__inline__ int ip_finish_output(struct sk_buff *skb) +int ip_finish_output(struct sk_buff *skb) { struct net_device *dev = skb->dst->dev; @@ -205,10 +230,6 @@ * If the indicated interface is up and running, send the packet. */ IP_INC_STATS(IpOutRequests); -#ifdef CONFIG_IP_ROUTE_NAT - if (rt->rt_flags & RTCF_NAT) - ip_do_nat(skb); -#endif skb->dev = dev; skb->protocol = htons(ETH_P_IP); @@ -253,90 +274,26 @@ newskb->dev, ip_dev_loopback_xmit); } - return ip_finish_output(skb); + if (skb->len > dst_pmtu(&rt->u.dst) || skb_shinfo(skb)->frag_list) + return ip_fragment(skb, ip_finish_output); + else + return ip_finish_output(skb); } int ip_output(struct sk_buff *skb) { -#ifdef CONFIG_IP_ROUTE_NAT - struct rtable *rt = (struct rtable*)skb->dst; -#endif - IP_INC_STATS(IpOutRequests); -#ifdef CONFIG_IP_ROUTE_NAT - if (rt->rt_flags&RTCF_NAT) - ip_do_nat(skb); + if ((skb->len > dst_pmtu(skb->dst) || skb_shinfo(skb)->frag_list) && +#ifdef NETIF_F_TSO + !skb_shinfo(skb)->tso_size +#else + 1 #endif - - return ip_finish_output(skb); -} - -/* Queues a packet to be sent, and starts the transmitter if necessary. - * This routine also needs to put in the total length and compute the - * checksum. 
We use to do this in two stages, ip_build_header() then - * this, but that scheme created a mess when routes disappeared etc. - * So we do it all here, and the TCP send engine has been changed to - * match. (No more unroutable FIN disasters, etc. wheee...) This will - * most likely make other reliable transport layers above IP easier - * to implement under Linux. - */ -static inline int ip_queue_xmit2(struct sk_buff *skb) -{ - struct sock *sk = skb->sk; - struct rtable *rt = (struct rtable *)skb->dst; - struct net_device *dev; - struct iphdr *iph = skb->nh.iph; - - dev = rt->u.dst.dev; - - /* This can happen when the transport layer has segments queued - * with a cached route, and by the time we get here things are - * re-routed to a device with a different MTU than the original - * device. Sick, but we must cover it. - */ - if (skb_headroom(skb) < dev->hard_header_len && dev->hard_header) { - struct sk_buff *skb2; - - skb2 = skb_realloc_headroom(skb, (dev->hard_header_len + 15) & ~15); - kfree_skb(skb); - if (skb2 == NULL) - return -ENOMEM; - if (sk) - skb_set_owner_w(skb2, sk); - skb = skb2; - iph = skb->nh.iph; - } - - if (skb->len > rt->u.dst.pmtu) - goto fragment; - - ip_select_ident(iph, &rt->u.dst, sk); - - /* Add an IP checksum. */ - ip_send_check(iph); - - skb->priority = sk->priority; - return skb->dst->output(skb); - -fragment: - if (ip_dont_fragment(sk, &rt->u.dst)) { - /* Reject packet ONLY if TCP might fragment - * it itself, if were careful enough. 
- */ - NETDEBUG(printk(KERN_DEBUG "sending pkt_too_big (len[%u] pmtu[%u]) to self\n", - skb->len, rt->u.dst.pmtu)); - - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(rt->u.dst.pmtu)); - kfree_skb(skb); - return -EMSGSIZE; - } - ip_select_ident(iph, &rt->u.dst, sk); - if (skb->ip_summed == CHECKSUM_HW && - (skb = skb_checksum_help(skb)) == NULL) - return -ENOMEM; - return ip_fragment(skb, skb->dst->output); + ) + return ip_fragment(skb, ip_finish_output); + else + return ip_finish_output(skb); } int ip_queue_xmit(struct sk_buff *skb, int ipfragok) @@ -345,6 +302,9 @@ struct ip_options *opt = sk->protinfo.af_inet.opt; struct rtable *rt; struct iphdr *iph; +#ifdef NETIF_F_TSO + u32 mtu; +#endif /* Skip all of this if the packet is already routed, * f.e. by something like SCTP. @@ -363,14 +323,24 @@ if(opt && opt->srr) daddr = opt->faddr; - /* If this fails, retransmit mechanism of transport layer will - * keep trying until route appears or the connection times itself - * out. - */ - if (ip_route_output(&rt, daddr, sk->saddr, - RT_CONN_FLAGS(sk), - sk->bound_dev_if)) - goto no_route; + { + struct flowi fl = { .oif = sk->bound_dev_if, + .nl_u = { .ip4_u = + { .daddr = daddr, + .saddr = sk->saddr, + .tos = RT_CONN_FLAGS(sk) } }, + .proto = sk->protocol, + .uli_u = { .ports = + { .sport = sk->sport, + .dport = sk->dport } } }; + + /* If this fails, retransmit mechanism of transport layer will + * keep trying until route appears or the connection times + * itself out. 
+ */ + if (ip_route_output_flow(&rt, &fl, sk, 0)) + goto no_route; + } __sk_dst_set(sk, &rt->u.dst); sk->route_caps = rt->u.dst.dev->features; } @@ -388,7 +358,7 @@ iph->frag_off = htons(IP_DF); else iph->frag_off = 0; - iph->ttl = sk->protinfo.af_inet.ttl; + iph->ttl = ip_select_ttl(&sk->protinfo.af_inet, &rt->u.dst); iph->protocol = sk->protocol; iph->saddr = rt->rt_src; iph->daddr = rt->rt_dst; @@ -400,8 +370,30 @@ ip_options_build(skb, opt, sk->daddr, rt, 0); } +#ifdef NETIF_F_TSO + mtu = dst_pmtu(&rt->u.dst); + if (skb->len > mtu && (sk->route_caps&NETIF_F_TSO)) { + unsigned int hlen; + + /* Hack zone: all this must be done by TCP. */ + hlen = ((skb->h.raw - skb->data) + (skb->h.th->doff << 2)); + skb_shinfo(skb)->tso_size = mtu - hlen; + skb_shinfo(skb)->tso_segs = + (skb->len - hlen + skb_shinfo(skb)->tso_size - 1)/ + skb_shinfo(skb)->tso_size - 1; + } + ip_select_ident_more(iph, &rt->u.dst, sk, skb_shinfo(skb)->tso_segs); +#else + ip_select_ident(iph, &rt->u.dst, sk); +#endif + + /* Add an IP checksum. */ + ip_send_check(iph); + + skb->priority = sk->priority; + return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, - ip_queue_xmit2); + dst_output); no_route: IP_INC_STATS(IpOutNoRoutes); @@ -409,336 +401,32 @@ return -EHOSTUNREACH; } -/* - * Build and send a packet, with as little as one copy - * - * Doesn't care much about ip options... option length can be - * different for fragment at 0 and other fragments. - * - * Note that the fragment at the highest offset is sent first, - * so the getfrag routine can fill in the TCP/UDP checksum header - * field in the last fragment it sends... actually it also helps - * the reassemblers, they can put most packets in at the head of - * the fragment queue, and they know the total size in advance. This - * last feature will measurably improve the Linux fragment handler one - * day. 
- * - * The callback has five args, an arbitrary pointer (copy of frag), - * the source IP address (may depend on the routing table), the - * destination address (char *), the offset to copy from, and the - * length to be copied. - */ - -static int ip_build_xmit_slow(struct sock *sk, - int getfrag (const void *, - char *, - unsigned int, - unsigned int), - const void *frag, - unsigned length, - struct ipcm_cookie *ipc, - struct rtable *rt, - int flags) -{ - unsigned int fraglen, maxfraglen, fragheaderlen; - int err; - int offset, mf; - int mtu; - u16 id; - - int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15; - int nfrags=0; - struct ip_options *opt = ipc->opt; - int df = 0; - - mtu = rt->u.dst.pmtu; - if (ip_dont_fragment(sk, &rt->u.dst)) - df = htons(IP_DF); - - length -= sizeof(struct iphdr); - - if (opt) { - fragheaderlen = sizeof(struct iphdr) + opt->optlen; - maxfraglen = ((mtu-sizeof(struct iphdr)-opt->optlen) & ~7) + fragheaderlen; - } else { - fragheaderlen = sizeof(struct iphdr); - - /* - * Fragheaderlen is the size of 'overhead' on each buffer. Now work - * out the size of the frames to send. - */ - - maxfraglen = ((mtu-sizeof(struct iphdr)) & ~7) + fragheaderlen; - } - - if (length + fragheaderlen > 0xFFFF) { - ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu); - return -EMSGSIZE; - } - - /* - * Start at the end of the frame by handling the remainder. - */ - - offset = length - (length % (maxfraglen - fragheaderlen)); - - /* - * Amount of memory to allocate for final fragment. - */ - - fraglen = length - offset + fragheaderlen; - - if (length-offset==0) { - fraglen = maxfraglen; - offset -= maxfraglen-fragheaderlen; - } - - /* - * The last fragment will not have MF (more fragments) set. - */ - - mf = 0; - - /* - * Don't fragment packets for path mtu discovery. 
- */ - - if (offset > 0 && sk->protinfo.af_inet.pmtudisc==IP_PMTUDISC_DO) { - ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu); - return -EMSGSIZE; - } - if (flags&MSG_PROBE) - goto out; - - /* - * Begin outputting the bytes. - */ - - id = sk->protinfo.af_inet.id++; - - do { - char *data; - struct sk_buff * skb; - - /* - * Get the memory we require with some space left for alignment. - */ - if (!(flags & MSG_DONTWAIT) || nfrags == 0) { - skb = sock_alloc_send_skb(sk, fraglen + hh_len + 15, - (flags & MSG_DONTWAIT), &err); - } else { - /* On a non-blocking write, we check for send buffer - * usage on the first fragment only. - */ - skb = sock_wmalloc(sk, fraglen + hh_len + 15, 1, - sk->allocation); - if (!skb) - err = -ENOBUFS; - } - if (skb == NULL) - goto error; - - /* - * Fill in the control structures - */ - - skb->priority = sk->priority; - skb->dst = dst_clone(&rt->u.dst); - skb_reserve(skb, hh_len); - - /* - * Find where to start putting bytes. - */ - - data = skb_put(skb, fraglen); - skb->nh.iph = (struct iphdr *)data; - - /* - * Only write IP header onto non-raw packets - */ - - { - struct iphdr *iph = (struct iphdr *)data; - - iph->version = 4; - iph->ihl = 5; - if (opt) { - iph->ihl += opt->optlen>>2; - ip_options_build(skb, opt, - ipc->addr, rt, offset); - } - iph->tos = sk->protinfo.af_inet.tos; - iph->tot_len = htons(fraglen - fragheaderlen + iph->ihl*4); - iph->frag_off = htons(offset>>3)|mf|df; - iph->id = id; - if (!mf) { - if (offset || !df) { - /* Select an unpredictable ident only - * for packets without DF or having - * been fragmented. - */ - __ip_select_ident(iph, &rt->u.dst); - id = iph->id; - } - - /* - * Any further fragments will have MF set. 
- */ - mf = htons(IP_MF); - } - if (rt->rt_type == RTN_MULTICAST) - iph->ttl = sk->protinfo.af_inet.mc_ttl; - else - iph->ttl = sk->protinfo.af_inet.ttl; - iph->protocol = sk->protocol; - iph->check = 0; - iph->saddr = rt->rt_src; - iph->daddr = rt->rt_dst; - iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); - data += iph->ihl*4; - } - - /* - * User data callback - */ - - if (getfrag(frag, data, offset, fraglen-fragheaderlen)) { - err = -EFAULT; - kfree_skb(skb); - goto error; - } - - offset -= (maxfraglen-fragheaderlen); - fraglen = maxfraglen; - - nfrags++; - - err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, - skb->dst->dev, output_maybe_reroute); - if (err) { - if (err > 0) - err = sk->protinfo.af_inet.recverr ? net_xmit_errno(err) : 0; - if (err) - goto error; - } - } while (offset >= 0); - - if (nfrags>1) - ip_statistics[smp_processor_id()*2 + !in_softirq()].IpFragCreates += nfrags; -out: - return 0; - -error: - IP_INC_STATS(IpOutDiscards); - if (nfrags>1) - ip_statistics[smp_processor_id()*2 + !in_softirq()].IpFragCreates += nfrags; - return err; -} - -/* - * Fast path for unfragmented packets. - */ -int ip_build_xmit(struct sock *sk, - int getfrag (const void *, - char *, - unsigned int, - unsigned int), - const void *frag, - unsigned length, - struct ipcm_cookie *ipc, - struct rtable *rt, - int flags) +static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) { - int err; - struct sk_buff *skb; - int df; - struct iphdr *iph; - - /* - * Try the simple case first. This leaves fragmented frames, and by - * choice RAW frames within 20 bytes of maximum size(rare) to the long path - */ - - if (!sk->protinfo.af_inet.hdrincl) { - length += sizeof(struct iphdr); - - /* - * Check for slow path. 
- */ - if (length > rt->u.dst.pmtu || ipc->opt != NULL) - return ip_build_xmit_slow(sk,getfrag,frag,length,ipc,rt,flags); - } else { - if (length > rt->u.dst.dev->mtu) { - ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, rt->u.dst.dev->mtu); - return -EMSGSIZE; - } - } - if (flags&MSG_PROBE) - goto out; + to->pkt_type = from->pkt_type; + to->priority = from->priority; + to->protocol = from->protocol; + to->security = from->security; + to->dst = dst_clone(from->dst); + to->dev = from->dev; - /* - * Do path mtu discovery if needed. - */ - df = 0; - if (ip_dont_fragment(sk, &rt->u.dst)) - df = htons(IP_DF); + /* Copy the flags to each fragment. */ + IPCB(to)->flags = IPCB(from)->flags; - /* - * Fast path for unfragmented frames without options. - */ - { - int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15; - - skb = sock_alloc_send_skb(sk, length+hh_len+15, - flags&MSG_DONTWAIT, &err); - if(skb==NULL) - goto error; - skb_reserve(skb, hh_len); - } - - skb->priority = sk->priority; - skb->dst = dst_clone(&rt->u.dst); - - skb->nh.iph = iph = (struct iphdr *)skb_put(skb, length); - - if(!sk->protinfo.af_inet.hdrincl) { - iph->version=4; - iph->ihl=5; - iph->tos=sk->protinfo.af_inet.tos; - iph->tot_len = htons(length); - iph->frag_off = df; - iph->ttl=sk->protinfo.af_inet.mc_ttl; - ip_select_ident(iph, &rt->u.dst, sk); - if (rt->rt_type != RTN_MULTICAST) - iph->ttl=sk->protinfo.af_inet.ttl; - iph->protocol=sk->protocol; - iph->saddr=rt->rt_src; - iph->daddr=rt->rt_dst; - iph->check=0; - iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); - err = getfrag(frag, ((char *)iph)+iph->ihl*4,0, length-iph->ihl*4); - } - else - err = getfrag(frag, (void *)iph, 0, length); - - if (err) - goto error_fault; - - err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, - output_maybe_reroute); - if (err > 0) - err = sk->protinfo.af_inet.recverr ? 
net_xmit_errno(err) : 0; - if (err) - goto error; -out: - return 0; - -error_fault: - err = -EFAULT; - kfree_skb(skb); -error: - IP_INC_STATS(IpOutDiscards); - return err; +#ifdef CONFIG_NET_SCHED + to->tc_index = from->tc_index; +#endif +#ifdef CONFIG_NETFILTER + to->nfmark = from->nfmark; + to->nfcache = from->nfcache; + /* Connection association is same as pre-frag packet */ + nf_conntrack_put(to->nfct); + to->nfct = from->nfct; + nf_conntrack_get(to->nfct); +#ifdef CONFIG_NETFILTER_DEBUG + to->nf_debug = from->nf_debug; +#endif +#endif } /* @@ -746,8 +434,6 @@ * smaller pieces (each of size equal to IP header plus * a block of the data of the original IP data part) that will yet fit in a * single device frame, and queue such a frame for sending. - * - * Yes this is inefficient, feel free to submit a quicker one. */ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) @@ -771,13 +457,111 @@ iph = skb->nh.iph; + if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) { + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(dst_pmtu(&rt->u.dst))); + kfree_skb(skb); + return -EMSGSIZE; + } + /* * Setup starting values. */ hlen = iph->ihl * 4; + mtu = dst_pmtu(&rt->u.dst) - hlen; /* Size of data space */ + + /* When frag_list is given, use it. First, check its validity: + * some transformers could create wrong frag_list or break existing + * one, it is not prohibited. In this case fall back to copying. + * + * LATER: this step can be merged to real generation of fragments, + * we can switch to copy when see the first bad fragment. + */ + if (skb_shinfo(skb)->frag_list) { + struct sk_buff *frag; + int first_len = skb_pagelen(skb); + + if (first_len - hlen > mtu || + ((first_len - hlen) & 7) || + (iph->frag_off & htons(IP_MF|IP_OFFSET)) || + skb_cloned(skb)) + goto slow_path; + + for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) { + /* Correct geometry. 
*/ + if (frag->len > mtu || + ((frag->len & 7) && frag->next) || + skb_headroom(frag) < hlen) + goto slow_path; + + /* Correct socket ownership. */ + if (frag->sk == NULL) + goto slow_path; + + /* Partially cloned skb? */ + if (skb_shared(frag)) + goto slow_path; + } + + /* Everything is OK. Generate! */ + + err = 0; + offset = 0; + frag = skb_shinfo(skb)->frag_list; + skb_shinfo(skb)->frag_list = 0; + skb->data_len = first_len - skb_headlen(skb); + skb->len = first_len; + iph->tot_len = htons(first_len); + iph->frag_off |= htons(IP_MF); + ip_send_check(iph); + + for (;;) { + /* Prepare header of the next frame, + * before previous one went down. */ + if (frag) { + frag->h.raw = frag->data; + frag->nh.raw = __skb_push(frag, hlen); + memcpy(frag->nh.raw, iph, hlen); + iph = frag->nh.iph; + iph->tot_len = htons(frag->len); + ip_copy_metadata(frag, skb); + if (offset == 0) + ip_options_fragment(frag); + offset += skb->len - hlen; + iph->frag_off = htons(offset>>3); + if (frag->next != NULL) + iph->frag_off |= htons(IP_MF); + /* Ready, complete checksum */ + ip_send_check(iph); + } + + err = output(skb); + + if (err || !frag) + break; + + skb = frag; + frag = skb->next; + skb->next = NULL; + } + + if (err == 0) { + IP_INC_STATS(IpFragOKs); + return 0; + } + + while (frag) { + skb = frag->next; + kfree_skb(frag); + frag = skb; + } + IP_INC_STATS(IpFragFails); + return err; + } + +slow_path: left = skb->len - hlen; /* Space per frame */ - mtu = rt->u.dst.pmtu - hlen; /* Size of data space */ ptr = raw + hlen; /* Where to start from */ /* @@ -805,7 +589,7 @@ * Allocate buffer. 
*/ - if ((skb2 = alloc_skb(len+hlen+dev->hard_header_len+15,GFP_ATOMIC)) == NULL) { + if ((skb2 = alloc_skb(len+hlen+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) { NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n")); err = -ENOMEM; goto fail; @@ -815,14 +599,11 @@ * Set up data on packet */ - skb2->pkt_type = skb->pkt_type; - skb2->priority = skb->priority; - skb_reserve(skb2, (dev->hard_header_len+15)&~15); + ip_copy_metadata(skb2, skb); + skb_reserve(skb2, LL_RESERVED_SPACE(rt->u.dst.dev)); skb_put(skb2, len + hlen); skb2->nh.raw = skb2->data; skb2->h.raw = skb2->data + hlen; - skb2->protocol = skb->protocol; - skb2->security = skb->security; /* * Charge the memory for the fragment to any owner @@ -831,8 +612,6 @@ if (skb->sk) skb_set_owner_w(skb2, skb->sk); - skb2->dst = dst_clone(skb->dst); - skb2->dev = skb->dev; /* * Copy the packet header into the new buffer. @@ -862,9 +641,6 @@ if (offset == 0) ip_options_fragment(skb); - /* Copy the flags to each fragment. */ - IPCB(skb2)->flags = IPCB(skb)->flags; - /* * Added AC : If we are fragmenting a fragment that's not the * last fragment then keep MF on each bit @@ -874,20 +650,6 @@ ptr += len; offset += len; -#ifdef CONFIG_NET_SCHED - skb2->tc_index = skb->tc_index; -#endif -#ifdef CONFIG_NETFILTER - skb2->nfmark = skb->nfmark; - skb2->nfcache = skb->nfcache; - /* Connection association is same as pre-frag packet */ - skb2->nfct = skb->nfct; - nf_conntrack_get(skb2->nfct); -#ifdef CONFIG_NETFILTER_DEBUG - skb2->nf_debug = skb->nf_debug; -#endif -#endif - /* * Put this fragment into the sending queue. 
*/ @@ -912,40 +674,552 @@ return err; } +int +ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) +{ + struct iovec *iov = from; + + if (skb->ip_summed == CHECKSUM_HW) { /* copy only; skb->csum is left untouched */ + if (memcpy_fromiovecend(to, iov, offset, len) < 0) + return -EFAULT; + } else { /* copy and fold this chunk's partial checksum into skb->csum */ + unsigned int csum = 0; + if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0) + return -EFAULT; + skb->csum = csum_block_add(skb->csum, csum, odd); + } + return 0; +} + +static inline int +skb_can_coalesce(struct sk_buff *skb, int i, struct page *page, int off) +{ + if (i) { /* only fragment i-1 is considered for extension */ + skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1]; + return page == frag->page && + off == frag->page_offset+frag->size; + } + return 0; +} + +static inline unsigned int +csum_page(struct page *page, int offset, int copy) +{ + char *kaddr; + unsigned int csum; + kaddr = kmap(page); /* map the page so its bytes can be checksummed */ + csum = csum_partial(kaddr + offset, copy, 0); + kunmap(page); + return csum; +} + /* - * Fetch data from kernel space and fill in checksum if needed. + * ip_append_data() and ip_append_page() can make one large IP datagram + * from many pieces of data. Each piece will be held on the socket + * until ip_push_pending_frames() is called. Each piece can be a page + * or non-page data. + * + * Not only UDP, but also other transport protocols - e.g. raw sockets - can + * potentially use this interface. + * + * LATER: length must be adjusted for padding at the tail, when it is required. 
*/ -static int ip_reply_glue_bits(const void *dptr, char *to, unsigned int offset, - unsigned int fraglen) +int ip_append_data(struct sock *sk, + int getfrag(void *from, char *to, int offset, int len, + int odd, struct sk_buff *skb), + void *from, int length, int transhdrlen, + struct ipcm_cookie *ipc, struct rtable *rt, + unsigned int flags) { - struct ip_reply_arg *dp = (struct ip_reply_arg*)dptr; - u16 *pktp = (u16 *)to; - struct iovec *iov; - int len; - int hdrflag = 1; - - iov = &dp->iov[0]; - if (offset >= iov->iov_len) { - offset -= iov->iov_len; - iov++; - hdrflag = 0; - } - len = iov->iov_len - offset; - if (fraglen > len) { /* overlapping. */ - dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, len, - dp->csum); - offset = 0; - fraglen -= len; - to += len; - iov++; + struct inet_opt *inet = inet_sk(sk); + struct sk_buff *skb; + + struct ip_options *opt = NULL; + int hh_len; + int exthdrlen; + int mtu; + int copy; + int err; + int offset = 0; + unsigned int maxfraglen, fragheaderlen; + int csummode = CHECKSUM_NONE; + + if (flags&MSG_PROBE) + return 0; + + if (skb_queue_empty(&sk->write_queue)) { + /* + * setup for corking. + */ + opt = ipc->opt; + if (opt) { + if (inet->cork.opt == NULL) + inet->cork.opt = kmalloc(sizeof(struct ip_options)+40, sk->allocation); + memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen); + inet->cork.flags |= IPCORK_OPT; + inet->cork.addr = ipc->addr; + } + dst_hold(&rt->u.dst); + inet->cork.fragsize = mtu = dst_pmtu(&rt->u.dst); + inet->cork.rt = rt; + inet->cork.length = 0; + inet->sndmsg_page = NULL; + inet->sndmsg_off = 0; + if ((exthdrlen = rt->u.dst.header_len) != 0) { + length += exthdrlen; + transhdrlen += exthdrlen; + } + } else { + rt = inet->cork.rt; + if (inet->cork.flags & IPCORK_OPT) + opt = inet->cork.opt; + + transhdrlen = 0; + exthdrlen = 0; + mtu = inet->cork.fragsize; + } + hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); + + fragheaderlen = sizeof(struct iphdr) + (opt ? 
opt->optlen : 0); + maxfraglen = ((mtu-fragheaderlen) & ~7) + fragheaderlen; + + if (inet->cork.length + length > 0xFFFF - fragheaderlen) { + ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu-exthdrlen); + return -EMSGSIZE; } - dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, fraglen, - dp->csum); + /* + * transhdrlen > 0 means that this is the first fragment and we wish + * it won't be fragmented in the future. + */ + if (transhdrlen && + length + fragheaderlen <= maxfraglen && + rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) && + !exthdrlen) + csummode = CHECKSUM_HW; + + inet->cork.length += length; + + /* So, what's going on in the loop below? + * + * We use calculated fragment length to generate chained skb, + * each of segments is IP fragment ready for sending to network after + * adding appropriate IP header. + * + * Mistake is: + * + * If mtu-fragheaderlen is not 0 modulo 8, we generate additional + * small fragment of length (mtu-fragheaderlen)%8, even though + * it is not necessary. Not a big bug, but needs a fix. + */ + + if ((skb = skb_peek_tail(&sk->write_queue)) == NULL) + goto alloc_new_skb; + + while (length > 0) { + if ((copy = maxfraglen - skb->len) <= 0) { + char *data; + unsigned int datalen; + unsigned int fraglen; + unsigned int alloclen; + BUG_TRAP(copy == 0); + +alloc_new_skb: + datalen = maxfraglen - fragheaderlen; + if (datalen > length) + datalen = length; + + fraglen = datalen + fragheaderlen; + if ((flags & MSG_MORE) && + !(rt->u.dst.dev->features&NETIF_F_SG)) + alloclen = maxfraglen; + else + alloclen = datalen + fragheaderlen; + + /* The last fragment gets additional space at tail. + * Note, with MSG_MORE we overallocate on fragments, + * because we have no idea what fragment will be + * the last. 
+ */ + if (datalen == length) + alloclen += rt->u.dst.trailer_len; + + if (transhdrlen) { + skb = sock_alloc_send_skb(sk, + alloclen + hh_len + 15, + (flags & MSG_DONTWAIT), &err); + } else { + skb = NULL; + if (atomic_read(&sk->wmem_alloc) <= 2*sk->sndbuf) + skb = sock_wmalloc(sk, + alloclen + hh_len + 15, 1, + sk->allocation); + if (unlikely(skb == NULL)) + err = -ENOBUFS; + } + if (skb == NULL) + goto error; + + /* + * Fill in the control structures + */ + skb->ip_summed = csummode; + skb->csum = 0; + skb_reserve(skb, hh_len); + + /* + * Find where to start putting bytes. + */ + data = skb_put(skb, fraglen); + skb->nh.raw = data + exthdrlen; + data += fragheaderlen; + skb->h.raw = data + exthdrlen; + + copy = datalen - transhdrlen; + if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, 0, skb) < 0) { + err = -EFAULT; + kfree_skb(skb); + goto error; + } - if (hdrflag && dp->csumoffset) - *(pktp + dp->csumoffset) = csum_fold(dp->csum); /* fill in checksum */ - return 0; + offset += copy; + length -= datalen; + transhdrlen = 0; + exthdrlen = 0; + csummode = CHECKSUM_NONE; + + /* + * Put the packet on the pending queue. 
+ */ + __skb_queue_tail(&sk->write_queue, skb); + continue; + } + + if (copy > length) + copy = length; + + if (!(rt->u.dst.dev->features&NETIF_F_SG)) { + unsigned int off; + + off = skb->len; + if (getfrag(from, skb_put(skb, copy), + offset, copy, off, skb) < 0) { + __skb_trim(skb, off); + err = -EFAULT; + goto error; + } + } else { + int i = skb_shinfo(skb)->nr_frags; + skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1]; + struct page *page = inet->sndmsg_page; + int off = inet->sndmsg_off; + unsigned int left; + + if (page && (left = PAGE_SIZE - off) > 0) { + if (copy >= left) + copy = left; + if (page != frag->page) { + if (i == MAX_SKB_FRAGS) { + err = -EMSGSIZE; + goto error; + } + get_page(page); + skb_fill_page_desc(skb, i, page, inet->sndmsg_off, 0); + frag = &skb_shinfo(skb)->frags[i]; + } + } else if (i < MAX_SKB_FRAGS) { + if (copy > PAGE_SIZE) + copy = PAGE_SIZE; + page = alloc_pages(sk->allocation, 0); + if (page == NULL) { + err = -ENOMEM; + goto error; + } + inet->sndmsg_page = page; + inet->sndmsg_off = 0; + + skb_fill_page_desc(skb, i, page, 0, 0); + frag = &skb_shinfo(skb)->frags[i]; + skb->truesize += PAGE_SIZE; + atomic_add(PAGE_SIZE, &sk->wmem_alloc); + } else { + err = -EMSGSIZE; + goto error; + } + if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) { + err = -EFAULT; + goto error; + } + inet->sndmsg_off += copy; + frag->size += copy; + skb->len += copy; + skb->data_len += copy; + } + offset += copy; + length -= copy; + } + + return 0; + +error: + inet->cork.length -= length; + IP_INC_STATS(IpOutDiscards); + return err; +} + +ssize_t ip_append_page(struct sock *sk, struct page *page, + int offset, size_t size, int flags) +{ + struct inet_opt *inet = inet_sk(sk); + struct sk_buff *skb; + struct rtable *rt; + struct ip_options *opt = NULL; + int hh_len; + int mtu; + int len; + int err; + unsigned int maxfraglen, fragheaderlen; + + if (inet->hdrincl) + return -EPERM; + + if (flags&MSG_PROBE) + 
return 0; + + if (skb_queue_empty(&sk->write_queue)) + return -EINVAL; + + rt = inet->cork.rt; + if (inet->cork.flags & IPCORK_OPT) + opt = inet->cork.opt; + + if (!(rt->u.dst.dev->features&NETIF_F_SG)) + return -EOPNOTSUPP; + + hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); + mtu = inet->cork.fragsize; + + fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); + maxfraglen = ((mtu-fragheaderlen) & ~7) + fragheaderlen; + + if (inet->cork.length + size > 0xFFFF - fragheaderlen) { + ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu); + return -EMSGSIZE; + } + + if ((skb = skb_peek_tail(&sk->write_queue)) == NULL) + return -EINVAL; + + inet->cork.length += size; + + while (size > 0) { + int i; + if ((len = maxfraglen - skb->len) <= 0) { + char *data; + struct iphdr *iph; + BUG_TRAP(len == 0); + + skb = sock_wmalloc(sk, fragheaderlen + hh_len + 15, 1, + sk->allocation); + if (unlikely(!skb)) { + err = -ENOBUFS; + goto error; + } + + /* + * Fill in the control structures + */ + skb->ip_summed = CHECKSUM_NONE; + skb->csum = 0; + skb_reserve(skb, hh_len); + + /* + * Find where to start putting bytes. + */ + data = skb_put(skb, fragheaderlen); + skb->nh.iph = iph = (struct iphdr *)data; + data += fragheaderlen; + skb->h.raw = data; + + /* + * Put the packet on the pending queue. 
+ */ + __skb_queue_tail(&sk->write_queue, skb); + continue; + } + + i = skb_shinfo(skb)->nr_frags; + if (len > size) + len = size; + if (skb_can_coalesce(skb, i, page, offset)) { + skb_shinfo(skb)->frags[i-1].size += len; + } else if (i < MAX_SKB_FRAGS) { + get_page(page); + skb_fill_page_desc(skb, i, page, offset, len); + } else { + err = -EMSGSIZE; + goto error; + } + + if (skb->ip_summed == CHECKSUM_NONE) { + unsigned int csum; + csum = csum_page(page, offset, len); + skb->csum = csum_block_add(skb->csum, csum, skb->len); + } + + skb->len += len; + skb->data_len += len; + offset += len; + size -= len; + } + return 0; + +error: + inet->cork.length -= size; + IP_INC_STATS(IpOutDiscards); + return err; +} + +/* + * Combine all pending IP fragments on the socket into one IP datagram + * and push them out. + */ +int ip_push_pending_frames(struct sock *sk) +{ + struct sk_buff *skb, *tmp_skb; + struct sk_buff **tail_skb; + struct inet_opt *inet = inet_sk(sk); + struct ip_options *opt = NULL; + struct rtable *rt = inet->cork.rt; + struct iphdr *iph; + int df = 0; + __u8 ttl; + int err = 0; + + if ((skb = __skb_dequeue(&sk->write_queue)) == NULL) + goto out; + tail_skb = &(skb_shinfo(skb)->frag_list); + + /* move skb->data to ip header from ext header */ + if (skb->data < skb->nh.raw) + __skb_pull(skb, skb->nh.raw - skb->data); + while ((tmp_skb = __skb_dequeue(&sk->write_queue)) != NULL) { + __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw); + *tail_skb = tmp_skb; + tail_skb = &(tmp_skb->next); + skb->len += tmp_skb->len; + skb->data_len += tmp_skb->len; +#if 0 /* Logically correct, but useless work, ip_fragment() will have to undo */ + skb->truesize += tmp_skb->truesize; + __sock_put(tmp_skb->sk); + tmp_skb->destructor = NULL; + tmp_skb->sk = NULL; +#endif + } + + /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow + * the frame generated here to be fragmented. No matter which transforms + * are applied or how they change the size of the packet, it will come out. 
+ */ + if (inet->pmtudisc != IP_PMTUDISC_DO) + skb->local_df = 1; + + /* DF bit is set when we want to see DF on outgoing frames. + * If local_df is set too, we still allow this frame to be fragmented + * locally. */ + if (inet->pmtudisc == IP_PMTUDISC_DO || + (!skb_shinfo(skb)->frag_list && ip_dont_fragment(sk, &rt->u.dst))) + df = htons(IP_DF); + + if (inet->cork.flags & IPCORK_OPT) + opt = inet->cork.opt; + + if (rt->rt_type == RTN_MULTICAST) + ttl = inet->mc_ttl; + else + ttl = ip_select_ttl(inet, &rt->u.dst); + + iph = (struct iphdr *)skb->data; + iph->version = 4; + iph->ihl = 5; /* bare header, in 32-bit words; grown below when options are present */ + if (opt) { + iph->ihl += opt->optlen>>2; + ip_options_build(skb, opt, inet->cork.addr, rt, 0); + } + iph->tos = inet->tos; + iph->tot_len = htons(skb->len); + iph->frag_off = df; + if (!df) { + __ip_select_ident(iph, &rt->u.dst); + } else { + iph->id = htons(inet->id++); + } + iph->ttl = ttl; + iph->protocol = sk->protocol; + iph->saddr = rt->rt_src; + iph->daddr = rt->rt_dst; + ip_send_check(iph); + + skb->priority = sk->priority; + skb->dst = dst_clone(&rt->u.dst); + + /* Netfilter gets the whole, not yet fragmented skb. */ + err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, + skb->dst->dev, dst_output); + if (err) { + if (err > 0) + err = inet->recverr ? net_xmit_errno(err) : 0; + if (err) + goto error; + } + +out: + inet->cork.flags &= ~IPCORK_OPT; + if (inet->cork.rt) { + ip_rt_put(inet->cork.rt); + inet->cork.rt = NULL; + } + return err; + +error: + IP_INC_STATS(IpOutDiscards); + goto out; +} + +/* + * Throw away all pending data on the socket. 
+ */ +void ip_flush_pending_frames(struct sock *sk) +{ + struct inet_opt *inet = inet_sk(sk); + struct sk_buff *skb; + + while ((skb = __skb_dequeue_tail(&sk->write_queue)) != NULL) + kfree_skb(skb); + + inet->cork.flags &= ~IPCORK_OPT; + if (inet->cork.opt) { + kfree(inet->cork.opt); + inet->cork.opt = NULL; + } + if (inet->cork.rt) { + ip_rt_put(inet->cork.rt); + inet->cork.rt = NULL; + } +} + + +/* + * Fetch data from kernel space and fill in checksum if needed. + */ +static int ip_reply_glue_bits(void *dptr, char *to, int offset, + int len, int odd, struct sk_buff *skb) +{ + unsigned int csum; + + csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0); + skb->csum = csum_block_add(skb->csum, csum, odd); + return 0; } /* @@ -954,6 +1228,8 @@ * * Should run single threaded per socket because it uses the sock * structure to pass arguments. + * + * LATER: switch from ip_build_xmit to ip_append_* */ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg, unsigned int len) @@ -979,8 +1255,19 @@ daddr = replyopts.opt.faddr; } - if (ip_route_output(&rt, daddr, rt->rt_spec_dst, RT_TOS(skb->nh.iph->tos), 0)) - return; + { + struct flowi fl = { .nl_u = { .ip4_u = + { .daddr = daddr, + .saddr = rt->rt_spec_dst, + .tos = RT_TOS(skb->nh.iph->tos) } }, + /* Not quite clean, but right. */ + .uli_u = { .ports = + { .sport = skb->h.th->dest, + .dport = skb->h.th->source } }, + .proto = sk->protocol }; + if (ip_route_output_key(&rt, &fl)) + return; + } /* And let IP do all the hard work. 
@@ -992,7 +1279,15 @@ sk->protinfo.af_inet.tos = skb->nh.iph->tos; sk->priority = skb->priority; sk->protocol = skb->nh.iph->protocol; - ip_build_xmit(sk, ip_reply_glue_bits, arg, len, &ipc, rt, MSG_DONTWAIT); + ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0, + &ipc, rt, MSG_DONTWAIT); + if ((skb = skb_peek(&sk->write_queue)) != NULL) { + if (arg->csumoffset >= 0) + *((u16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum)); + skb->ip_summed = CHECKSUM_NONE; + ip_push_pending_frames(sk); + } + bh_unlock_sock(sk); ip_rt_put(rt); Index: net/ipv4/ip_sockglue.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv4/ip_sockglue.c,v retrieving revision 1.1.1.22 retrieving revision 1.1.1.22.2.1 diff -u -r1.1.1.22 -r1.1.1.22.2.1 --- a/net/ipv4/ip_sockglue.c 14 Apr 2004 13:05:41 -0000 1.1.1.22 +++ b/net/ipv4/ip_sockglue.c 16 Apr 2004 13:16:22 -0000 1.1.1.22.2.1 @@ -36,6 +36,7 @@ #include #include #include +#include #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) #include #endif @@ -380,6 +381,7 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen) { + struct inet_opt *inet = inet_sk(sk); int val=0,err; if (level != SOL_IP) @@ -431,8 +433,10 @@ (!((1<state)&(TCPF_LISTEN|TCPF_CLOSE)) && sk->daddr != LOOPBACK4_IPV6)) { #endif + if (inet->opt) + tp->ext_header_len -= inet->opt->optlen; if (opt) - tp->ext_header_len = opt->optlen; + tp->ext_header_len += opt->optlen; tcp_sync_mss(sk, tp->pmtu_cookie); #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) } @@ -492,11 +496,9 @@ case IP_TTL: if (optlen<1) goto e_inval; - if(val==-1) - val = sysctl_ip_default_ttl; - if(val<1||val>255) + if (val != -1 && (val < 1 || val>255)) goto e_inval; - sk->protinfo.af_inet.ttl=val; + sk->protinfo.af_inet.uc_ttl = val; break; case IP_HDRINCL: if(sk->type!=SOCK_RAW) { @@ -839,6 +841,11 @@ sk->protinfo.af_inet.freebind = 
!!val; break; + case IP_IPSEC_POLICY: + case IP_XFRM_POLICY: + err = xfrm_user_policy(sk, optname, optval, optlen); + break; + default: #ifdef CONFIG_NETFILTER err = nf_setsockopt(sk, PF_INET, optname, optval, @@ -926,7 +933,9 @@ val=sk->protinfo.af_inet.tos; break; case IP_TTL: - val=sk->protinfo.af_inet.ttl; + val = (sk->protinfo.af_inet.uc_ttl == -1 ? + sysctl_ip_default_ttl : + sk->protinfo.af_inet.uc_ttl); break; case IP_HDRINCL: val=sk->protinfo.af_inet.hdrincl; @@ -940,7 +949,7 @@ val = 0; dst = sk_dst_get(sk); if (dst) { - val = dst->pmtu; + val = dst_pmtu(dst) - dst->header_len; dst_release(dst); } if (!val) { Index: net/ipv4/ipcomp.c =================================================================== RCS file: net/ipv4/ipcomp.c diff -N net/ipv4/ipcomp.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ b/net/ipv4/ipcomp.c 16 Apr 2004 13:16:22 -0000 1.7.2.1 @@ -0,0 +1,446 @@ +/* + * IP Payload Compression Protocol (IPComp) - RFC3173. + * + * Copyright (c) 2003 James Morris + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * Todo: + * - Tunable compression parameters. + * - Compression stats. + * - Adaptive compression. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb) +{ + int err, plen, dlen; + struct iphdr *iph; + struct ipcomp_data *ipcd = x->data; + u8 *start, *scratch = ipcd->scratch; + + plen = skb->len; + dlen = IPCOMP_SCRATCH_SIZE; + start = skb->data; + + err = crypto_comp_decompress(ipcd->tfm, start, plen, scratch, &dlen); + if (err) + goto out; + + if (dlen < (plen + sizeof(struct ip_comp_hdr))) { + err = -EINVAL; + goto out; + } + + err = pskb_expand_head(skb, 0, dlen - plen, GFP_ATOMIC); + if (err) + goto out; + + skb_put(skb, dlen - plen); + memcpy(skb->data, scratch, dlen); + iph = skb->nh.iph; + iph->tot_len = htons(dlen + iph->ihl * 4); +out: + return err; +} + +static int ipcomp_input(struct xfrm_state *x, + struct xfrm_decap_state *decap, struct sk_buff *skb) +{ + u8 nexthdr; + int err = 0; + struct iphdr *iph; + union { + struct iphdr iph; + char buf[60]; + } tmp_iph; + + + if ((skb_is_nonlinear(skb) || skb_cloned(skb)) && + skb_linearize(skb, GFP_ATOMIC) != 0) { + err = -ENOMEM; + goto out; + } + + skb->ip_summed = CHECKSUM_NONE; + + /* Remove ipcomp header and decompress original payload */ + iph = skb->nh.iph; + memcpy(&tmp_iph, iph, iph->ihl * 4); + nexthdr = *(u8 *)skb->data; + skb_pull(skb, sizeof(struct ip_comp_hdr)); + skb->nh.raw += sizeof(struct ip_comp_hdr); + memcpy(skb->nh.raw, &tmp_iph, tmp_iph.iph.ihl * 4); + iph = skb->nh.iph; + iph->tot_len = htons(ntohs(iph->tot_len) - sizeof(struct ip_comp_hdr)); + iph->protocol = nexthdr; + skb->h.raw = skb->data; + err = ipcomp_decompress(x, skb); + +out: + return err; +} + +static int ipcomp_compress(struct xfrm_state *x, struct sk_buff *skb) +{ + int err, plen, dlen, ihlen; + struct iphdr *iph = skb->nh.iph; + struct ipcomp_data *ipcd = x->data; + u8 *start, *scratch = ipcd->scratch; + + ihlen = iph->ihl * 4; + plen = skb->len - ihlen; + dlen = IPCOMP_SCRATCH_SIZE; + start = 
skb->data + ihlen; + + err = crypto_comp_compress(ipcd->tfm, start, plen, scratch, &dlen); + if (err) + goto out; + + if ((dlen + sizeof(struct ip_comp_hdr)) >= plen) { + err = -EMSGSIZE; + goto out; + } + + memcpy(start, scratch, dlen); + pskb_trim(skb, ihlen + dlen); + +out: + return err; +} + +static void ipcomp_tunnel_encap(struct xfrm_state *x, struct sk_buff *skb) +{ + struct dst_entry *dst = skb->dst; + struct iphdr *iph, *top_iph; + + iph = skb->nh.iph; + top_iph = (struct iphdr *)skb_push(skb, sizeof(struct iphdr)); + top_iph->ihl = 5; + top_iph->version = 4; + top_iph->tos = iph->tos; + top_iph->tot_len = htons(skb->len); + if (!(iph->frag_off&htons(IP_DF))) { +#ifdef NETIF_F_TSO + __ip_select_ident(top_iph, dst, 0); +#else + __ip_select_ident(top_iph, dst); +#endif + } + top_iph->ttl = iph->ttl; + top_iph->check = 0; + top_iph->saddr = x->props.saddr.a4; + top_iph->daddr = x->id.daddr.a4; + top_iph->frag_off = iph->frag_off&~htons(IP_MF|IP_OFFSET); + memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + skb->nh.raw = skb->data; +} + +static int ipcomp_output(struct sk_buff *skb) +{ + int err; + struct dst_entry *dst = skb->dst; + struct xfrm_state *x = dst->xfrm; + struct iphdr *iph, *top_iph; + struct ip_comp_hdr *ipch; + struct ipcomp_data *ipcd = x->data; + union { + struct iphdr iph; + char buf[60]; + } tmp_iph; + int hdr_len = 0; + + if (skb->ip_summed == CHECKSUM_HW && skb_checksum_help(skb) == NULL) { + err = -EINVAL; + goto error_nolock; + } + + spin_lock_bh(&x->lock); + err = xfrm_check_output(x, skb, AF_INET); + if (err) + goto error; + + /* Don't bother compressing */ + if (!x->props.mode) { + iph = skb->nh.iph; + hdr_len = iph->ihl * 4; + } + if ((skb->len - hdr_len) < ipcd->threshold) { + if (x->props.mode) { + ipcomp_tunnel_encap(x, skb); + iph = skb->nh.iph; + iph->protocol = IPPROTO_IPIP; + ip_send_check(iph); + } + goto out_ok; + } + + if (x->props.mode) + ipcomp_tunnel_encap(x, skb); + + if ((skb_is_nonlinear(skb) || 
skb_cloned(skb)) && + skb_linearize(skb, GFP_ATOMIC) != 0) { + err = -ENOMEM; + goto error; + } + + err = ipcomp_compress(x, skb); + if (err) { + if (err == -EMSGSIZE) { + if (x->props.mode) { + iph = skb->nh.iph; + iph->protocol = IPPROTO_IPIP; + ip_send_check(iph); + } + goto out_ok; + } + goto error; + } + + /* Install ipcomp header, convert into ipcomp datagram. */ + iph = skb->nh.iph; + memcpy(&tmp_iph, iph, iph->ihl * 4); + top_iph = (struct iphdr *)skb_push(skb, sizeof(struct ip_comp_hdr)); + memcpy(top_iph, &tmp_iph, iph->ihl * 4); + iph = top_iph; + if (x->props.mode && (x->props.flags & XFRM_STATE_NOECN)) + IP_ECN_clear(iph); + iph->tot_len = htons(skb->len); + iph->protocol = IPPROTO_COMP; + iph->check = 0; + ipch = (struct ip_comp_hdr *)((char *)iph + iph->ihl * 4); + ipch->nexthdr = x->props.mode ? IPPROTO_IPIP : tmp_iph.iph.protocol; + ipch->flags = 0; + ipch->cpi = htons((u16 )ntohl(x->id.spi)); + ip_send_check(iph); + skb->nh.raw = skb->data; + +out_ok: + x->curlft.bytes += skb->len; + x->curlft.packets++; + spin_unlock_bh(&x->lock); + + if ((skb->dst = dst_pop(dst)) == NULL) { + err = -EHOSTUNREACH; + goto error_nolock; + } + err = NET_XMIT_BYPASS; + +out_exit: + return err; +error: + spin_unlock_bh(&x->lock); +error_nolock: + kfree_skb(skb); + goto out_exit; +} + +static void ipcomp4_err(struct sk_buff *skb, u32 info) +{ + u32 spi; + struct iphdr *iph = (struct iphdr *)skb->data; + struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); + struct xfrm_state *x; + + if (skb->h.icmph->type != ICMP_DEST_UNREACH || + skb->h.icmph->code != ICMP_FRAG_NEEDED) + return; + + spi = ntohl(ntohs(ipch->cpi)); + x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, + spi, IPPROTO_COMP, AF_INET); + if (!x) + return; + printk(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%u.%u.%u.%u\n", + spi, NIPQUAD(iph->daddr)); + xfrm_state_put(x); +} + +/* We always hold one tunnel user reference to indicate a tunnel */ +static struct xfrm_state 
*ipcomp_tunnel_create(struct xfrm_state *x) +{ + struct xfrm_state *t; + + t = xfrm_state_alloc(); + if (t == NULL) + goto out; + + t->id.proto = IPPROTO_IPIP; + t->id.spi = x->props.saddr.a4; + t->id.daddr.a4 = x->id.daddr.a4; + memcpy(&t->sel, &x->sel, sizeof(t->sel)); + t->props.family = AF_INET; + t->props.mode = 1; + t->props.saddr.a4 = x->props.saddr.a4; + t->props.flags = x->props.flags; + + t->type = xfrm_get_type(IPPROTO_IPIP, t->props.family); + if (t->type == NULL) + goto error; + + if (t->type->init_state(t, NULL)) + goto error; + + t->km.state = XFRM_STATE_VALID; + atomic_set(&t->tunnel_users, 1); +out: + return t; + +error: + t->km.state = XFRM_STATE_DEAD; + xfrm_state_put(t); + t = NULL; + goto out; +} + +/* + * Must be protected by xfrm_cfg_sem. State and tunnel user references are + * always incremented on success. + */ +static int ipcomp_tunnel_attach(struct xfrm_state *x) +{ + int err = 0; + struct xfrm_state *t; + + t = xfrm_state_lookup((xfrm_address_t *)&x->id.daddr.a4, + x->props.saddr.a4, IPPROTO_IPIP, AF_INET); + if (!t) { + t = ipcomp_tunnel_create(x); + if (!t) { + err = -EINVAL; + goto out; + } + xfrm_state_insert(t); + xfrm_state_hold(t); + } + x->tunnel = t; + atomic_inc(&t->tunnel_users); +out: + return err; +} + +static void ipcomp_free_data(struct ipcomp_data *ipcd) +{ + if (ipcd->tfm) + crypto_free_tfm(ipcd->tfm); + if (ipcd->scratch) + kfree(ipcd->scratch); +} + +static void ipcomp_destroy(struct xfrm_state *x) +{ + struct ipcomp_data *ipcd = x->data; + if (!ipcd) + return; + ipcomp_free_data(ipcd); + kfree(ipcd); +} + +static int ipcomp_init_state(struct xfrm_state *x, void *args) +{ + int err; + struct ipcomp_data *ipcd; + struct xfrm_algo_desc *calg_desc; + + err = -EINVAL; + if (!x->calg) + goto out; + + err = -ENOMEM; + ipcd = kmalloc(sizeof(*ipcd), GFP_KERNEL); + if (!ipcd) + goto error; + + memset(ipcd, 0, sizeof(*ipcd)); + x->props.header_len = sizeof(struct ip_comp_hdr); + if (x->props.mode) + x->props.header_len += 
sizeof(struct iphdr); + + ipcd->scratch = kmalloc(IPCOMP_SCRATCH_SIZE, GFP_KERNEL); + if (!ipcd->scratch) + goto error; + + ipcd->tfm = crypto_alloc_tfm(x->calg->alg_name, 0); + if (!ipcd->tfm) + goto error; + + if (x->props.mode) { + err = ipcomp_tunnel_attach(x); + if (err) + goto error; + } + + calg_desc = xfrm_calg_get_byname(x->calg->alg_name); + BUG_ON(!calg_desc); + ipcd->threshold = calg_desc->uinfo.comp.threshold; + x->data = ipcd; + err = 0; +out: + return err; + +error: + if (ipcd) { + ipcomp_free_data(ipcd); + kfree(ipcd); + } + goto out; +} + +static struct xfrm_type ipcomp_type = +{ + .description = "IPCOMP4", + .proto = IPPROTO_COMP, + .init_state = ipcomp_init_state, + .destructor = ipcomp_destroy, + .input = ipcomp_input, + .output = ipcomp_output +}; + +static struct inet_protocol ipcomp4_protocol = { + .handler = xfrm4_rcv, + .err_handler = ipcomp4_err, + .no_policy = 1, +}; + +static int __init ipcomp4_init(void) +{ + SET_MODULE_OWNER(&ipcomp_type); + if (xfrm_register_type(&ipcomp_type, AF_INET) < 0) { + printk(KERN_INFO "ipcomp init: can't add xfrm type\n"); + return -EAGAIN; + } + if (inet_add_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0) { + printk(KERN_INFO "ipcomp init: can't add protocol\n"); + xfrm_unregister_type(&ipcomp_type, AF_INET); + return -EAGAIN; + } + return 0; +} + +static void __exit ipcomp4_fini(void) +{ + if (inet_del_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0) + printk(KERN_INFO "ip ipcomp close: can't remove protocol\n"); + if (xfrm_unregister_type(&ipcomp_type, AF_INET) < 0) + printk(KERN_INFO "ip ipcomp close: can't remove xfrm type\n"); +} + +module_init(ipcomp4_init); +module_exit(ipcomp4_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IP Payload Compression Protocol (IPComp) - RFC3173"); +MODULE_AUTHOR("James Morris "); + Index: net/ipv4/ipconfig.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv4/ipconfig.c,v 
retrieving revision 1.1.1.26 retrieving revision 1.1.1.26.2.1 diff -u -r1.1.1.26 -r1.1.1.26.2.1 --- a/net/ipv4/ipconfig.c 28 Nov 2003 18:26:21 -0000 1.1.1.26 +++ b/net/ipv4/ipconfig.c 16 Apr 2004 13:16:22 -0000 1.1.1.26.2.1 @@ -655,7 +655,7 @@ struct net_device *dev = d->dev; struct sk_buff *skb; struct bootp_pkt *b; - int hh_len = (dev->hard_header_len + 15) & ~15; + int hh_len = LL_RESERVED_SPACE(dev); struct iphdr *h; /* Allocate packet */ Index: net/ipv4/ipip.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv4/ipip.c,v retrieving revision 1.1.1.26 retrieving revision 1.1.1.26.2.1 diff -u -r1.1.1.26 -r1.1.1.26.2.1 --- a/net/ipv4/ipip.c 28 Nov 2003 18:26:21 -0000 1.1.1.26 +++ b/net/ipv4/ipip.c 16 Apr 2004 13:16:22 -0000 1.1.1.26.2.1 @@ -115,6 +115,7 @@ #include #include #include +#include #define HASH_SIZE 16 #define HASH(addr) ((addr^(addr>>4))&0xF) @@ -207,7 +208,7 @@ write_unlock_bh(&ipip_lock); } -struct ip_tunnel * ipip_tunnel_locate(struct ip_tunnel_parm *parms, int create) +static struct ip_tunnel * ipip_tunnel_locate(struct ip_tunnel_parm *parms, int create) { u32 remote = parms->iph.daddr; u32 local = parms->iph.saddr; @@ -289,7 +290,7 @@ dev_put(dev); } -void ipip_err(struct sk_buff *skb, u32 info) +static void ipip_err(struct sk_buff *skb, void *__unused) { #ifndef I_WISH_WORLD_WERE_PERFECT @@ -355,6 +356,7 @@ int rel_code = 0; int rel_info = 0; struct sk_buff *skb2; + struct flowi fl; struct rtable *rt; if (len < hlen + sizeof(struct iphdr)) @@ -417,7 +419,11 @@ skb2->nh.raw = skb2->data; /* Try to guess incoming interface */ - if (ip_route_output(&rt, eiph->saddr, 0, RT_TOS(eiph->tos), 0)) { + memset(&fl, 0, sizeof(fl)); + fl.fl4_dst = eiph->saddr; + fl.fl4_tos = RT_TOS(eiph->tos); + fl.proto = IPPROTO_IPIP; + if (ip_route_output_key(&rt, &fl)) { kfree_skb(skb2); return; } @@ -427,8 +433,11 @@ if (rt->rt_flags&RTCF_LOCAL) { ip_rt_put(rt); rt = NULL; - if 
(ip_route_output(&rt, eiph->daddr, eiph->saddr, eiph->tos, 0) || - rt->u.dst.dev->type != ARPHRD_IPGRE) { + fl.fl4_dst = eiph->daddr; + fl.fl4_src = eiph->saddr; + fl.fl4_tos = eiph->tos; + if (ip_route_output_key(&rt, &fl) || + rt->u.dst.dev->type != ARPHRD_TUNNEL) { ip_rt_put(rt); kfree_skb(skb2); return; @@ -436,7 +445,7 @@ } else { ip_rt_put(rt); if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) || - skb2->dst->dev->type != ARPHRD_IPGRE) { + skb2->dst->dev->type != ARPHRD_TUNNEL) { kfree_skb(skb2); return; } @@ -444,11 +453,11 @@ /* change mtu on this route */ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { - if (rel_info > skb2->dst->pmtu) { + if (rel_info > dst_pmtu(skb2->dst)) { kfree_skb(skb2); return; } - skb2->dst->pmtu = rel_info; + skb2->dst->ops->update_pmtu(skb2->dst, rel_info); rel_info = htonl(rel_info); } else if (type == ICMP_TIME_EXCEEDED) { struct ip_tunnel *t = (struct ip_tunnel*)skb2->dev->priv; @@ -473,7 +482,7 @@ IP_ECN_set_ce(inner_iph); } -int ipip_rcv(struct sk_buff *skb) +static int ipip_rcv(struct sk_buff *skb) { struct iphdr *iph; struct ip_tunnel *tunnel; @@ -482,14 +491,23 @@ goto out; iph = skb->nh.iph; - skb->mac.raw = skb->nh.raw; - skb->nh.raw = skb->data; - memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); - skb->protocol = htons(ETH_P_IP); - skb->pkt_type = PACKET_HOST; read_lock(&ipip_lock); if ((tunnel = ipip_tunnel_lookup(iph->saddr, iph->daddr)) != NULL) { + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { + read_unlock(&ipip_lock); + kfree_skb(skb); + return 0; + } + + secpath_reset(skb); + + skb->mac.raw = skb->nh.raw; + skb->nh.raw = skb->data; + memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + skb->protocol = htons(ETH_P_IP); + skb->pkt_type = PACKET_HOST; + tunnel->stat.rx_packets++; tunnel->stat.rx_bytes += skb->len; skb->dev = tunnel->dev; @@ -509,16 +527,8 @@ } read_unlock(&ipip_lock); - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); out: - kfree_skb(skb); - return 0; -} - 
-/* Need this wrapper because NF_HOOK takes the function address */ -static inline int do_ip_send(struct sk_buff *skb) -{ - return ip_send(skb); + return -1; } /* @@ -562,9 +571,17 @@ goto tx_error_icmp; } - if (ip_route_output(&rt, dst, tiph->saddr, RT_TOS(tos), tunnel->parms.link)) { - tunnel->stat.tx_carrier_errors++; - goto tx_error_icmp; + { + struct flowi fl = { .oif = tunnel->parms.link, + .nl_u = { .ip4_u = + { .daddr = dst, + .saddr = tiph->saddr, + .tos = RT_TOS(tos) } }, + .proto = IPPROTO_IPIP }; + if (ip_route_output_key(&rt, &fl)) { + tunnel->stat.tx_carrier_errors++; + goto tx_error_icmp; + } } tdev = rt->u.dst.dev; @@ -575,17 +592,17 @@ } if (tiph->frag_off) - mtu = rt->u.dst.pmtu - sizeof(struct iphdr); + mtu = dst_pmtu(&rt->u.dst) - sizeof(struct iphdr); else - mtu = skb->dst ? skb->dst->pmtu : dev->mtu; + mtu = skb->dst ? dst_pmtu(skb->dst) : dev->mtu; if (mtu < 68) { tunnel->stat.collisions++; ip_rt_put(rt); goto tx_error; } - if (skb->dst && mtu < skb->dst->pmtu) - skb->dst->pmtu = mtu; + if (skb->dst) + skb->dst->ops->update_pmtu(skb->dst, mtu); df |= (old_iph->frag_off&htons(IP_DF)); @@ -606,7 +623,7 @@ /* * Okay, now see if we can stuff it in the buffer as-is. 
*/ - max_headroom = (((tdev->hard_header_len+15)&~15)+sizeof(struct iphdr)); + max_headroom = (LL_RESERVED_SPACE(tdev)+sizeof(struct iphdr)); if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) { struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); @@ -824,8 +841,14 @@ ipip_tunnel_init_gen(dev); if (iph->daddr) { + struct flowi fl = { .oif = tunnel->parms.link, + .nl_u = { .ip4_u = + { .daddr = iph->daddr, + .saddr = iph->saddr, + .tos = RT_TOS(iph->tos) } }, + .proto = IPPROTO_IPIP }; struct rtable *rt; - if (!ip_route_output(&rt, iph->daddr, iph->saddr, RT_TOS(iph->tos), tunnel->parms.link)) { + if (!ip_route_output_key(&rt, &fl)) { tdev = rt->u.dst.dev; ip_rt_put(rt); } @@ -858,7 +881,7 @@ } #endif -int __init ipip_fb_tunnel_init(struct net_device *dev) +static int __init ipip_fb_tunnel_init(struct net_device *dev) { struct iphdr *iph; @@ -878,11 +901,9 @@ return 0; } -static struct inet_protocol ipip_protocol = { - handler: ipip_rcv, - err_handler: ipip_err, - protocol: IPPROTO_IPIP, - name: "IPIP" +static struct xfrm_tunnel ipip_handler = { + .handler = ipip_rcv, + .err_handler = ipip_err, }; static char banner[] __initdata = @@ -892,16 +913,20 @@ { printk(banner); + if (xfrm4_tunnel_register(&ipip_handler) < 0) { + printk(KERN_INFO "ipip init: can't register tunnel\n"); + return -EAGAIN; + } + ipip_fb_tunnel_dev.priv = (void*)&ipip_fb_tunnel; register_netdev(&ipip_fb_tunnel_dev); - inet_add_protocol(&ipip_protocol); return 0; } static void __exit ipip_fini(void) { - if ( inet_del_protocol(&ipip_protocol) < 0 ) - printk(KERN_INFO "ipip close: can't remove protocol\n"); + if (xfrm4_tunnel_deregister(&ipip_handler) < 0) + printk(KERN_INFO "ipip close: can't deregister tunnel\n"); unregister_netdev(&ipip_fb_tunnel_dev); } Index: net/ipv4/ipmr.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv4/ipmr.c,v retrieving revision 1.1.1.23 
retrieving revision 1.1.1.23.2.1 diff -u -r1.1.1.23 -r1.1.1.23.2.1 --- a/net/ipv4/ipmr.c 28 Nov 2003 18:26:21 -0000 1.1.1.23 +++ b/net/ipv4/ipmr.c 16 Apr 2004 13:16:22 -0000 1.1.1.23.2.1 @@ -108,7 +108,7 @@ static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert); static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm); -extern struct inet_protocol pim_protocol; +static struct inet_protocol pim_protocol; static struct timer_list ipmr_expire_timer; @@ -928,23 +928,28 @@ #ifdef CONFIG_IP_PIMSM case MRT_PIM: { - int v; + int v, ret; if(get_user(v,(int *)optval)) return -EFAULT; v = (v)?1:0; rtnl_lock(); + ret = 0; if (v != mroute_do_pim) { mroute_do_pim = v; mroute_do_assert = v; #ifdef CONFIG_IP_PIMSM_V2 if (mroute_do_pim) - inet_add_protocol(&pim_protocol); + ret = inet_add_protocol(&pim_protocol, + IPPROTO_PIM); else - inet_del_protocol(&pim_protocol); + ret = inet_del_protocol(&pim_protocol, + IPPROTO_PIM); + if (ret < 0) + ret = -EAGAIN; #endif } rtnl_unlock(); - return 0; + return ret; } #endif /* @@ -1105,16 +1110,14 @@ static inline int ipmr_forward_finish(struct sk_buff *skb) { - struct ip_options *opt = &(IPCB(skb)->opt); - struct dst_entry *dst = skb->dst; + struct ip_options * opt = &(IPCB(skb)->opt); + + IP_INC_STATS_BH(IpForwDatagrams); if (unlikely(opt->optlen)) ip_forward_options(skb); - if (skb->len <= dst->pmtu) - return dst->output(skb); - else - return ip_fragment(skb, dst->output); + return dst_output(skb); } /* @@ -1146,17 +1149,28 @@ #endif if (vif->flags&VIFF_TUNNEL) { - if (ip_route_output(&rt, vif->remote, vif->local, RT_TOS(iph->tos), vif->link)) + struct flowi fl = { .oif = vif->link, + .nl_u = { .ip4_u = + { .daddr = vif->remote, + .saddr = vif->local, + .tos = RT_TOS(iph->tos) } }, + .proto = IPPROTO_IPIP }; + if (ip_route_output_key(&rt, &fl)) return; encap = sizeof(struct iphdr); } else { - if (ip_route_output(&rt, iph->daddr, 0, RT_TOS(iph->tos), vif->link)) + struct flowi fl = { .oif 
= vif->link, + .nl_u = { .ip4_u = + { .daddr = iph->daddr, + .tos = RT_TOS(iph->tos) } }, + .proto = IPPROTO_IPIP }; + if (ip_route_output_key(&rt, &fl)) return; } dev = rt->u.dst.dev; - if (skb->len+encap > rt->u.dst.pmtu && (ntohs(iph->frag_off) & IP_DF)) { + if (skb->len+encap > dst_pmtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) { /* Do not fragment multicasts. Alas, IPv4 does not allow to send ICMP, so that packets will disappear to blackhole. @@ -1167,7 +1181,7 @@ return; } - encap += dev->hard_header_len; + encap += LL_RESERVED_SPACE(dev); if (skb_headroom(skb) < encap || skb_cloned(skb) || !last) skb2 = skb_realloc_headroom(skb, (encap + 15)&~15); @@ -1244,7 +1258,7 @@ if (vif_table[vif].dev != skb->dev) { int true_vifi; - if (((struct rtable*)skb->dst)->key.iif == 0) { + if (((struct rtable*)skb->dst)->fl.iif == 0) { /* It is our own packet, looped back. Very complicated situation... @@ -1394,19 +1408,15 @@ struct net_device *reg_dev = NULL; if (skb_is_nonlinear(skb)) { - if (skb_linearize(skb, GFP_ATOMIC) != 0) { - kfree_skb(skb); - return -ENOMEM; - } + if (skb_linearize(skb, GFP_ATOMIC) != 0) + goto drop; pim = (struct igmphdr*)skb->h.raw; } if (!mroute_do_pim || skb->len < sizeof(*pim) + sizeof(*encap) || - pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER) { - kfree_skb(skb); - return -EINVAL; - } + pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER) + goto drop; encap = (struct iphdr*)(skb->h.raw + sizeof(struct igmphdr)); /* @@ -1416,11 +1426,9 @@ c. 
packet is not truncated */ if (!MULTICAST(encap->daddr) || - ntohs(encap->tot_len) == 0 || - ntohs(encap->tot_len) + sizeof(*pim) > skb->len) { - kfree_skb(skb); - return -EINVAL; - } + encap->tot_len == 0 || + ntohs(encap->tot_len) + sizeof(*pim) > skb->len) + goto drop; read_lock(&mrt_lock); if (reg_vif_num >= 0) @@ -1429,10 +1437,8 @@ dev_hold(reg_dev); read_unlock(&mrt_lock); - if (reg_dev == NULL) { - kfree_skb(skb); - return -EINVAL; - } + if (reg_dev == NULL) + goto drop; skb->mac.raw = skb->nh.raw; skb_pull(skb, (u8*)encap - skb->data); @@ -1453,6 +1459,9 @@ netif_rx(skb); dev_put(reg_dev); return 0; + drop: + kfree_skb(skb); + return 0; } #endif @@ -1464,10 +1473,8 @@ struct net_device *reg_dev = NULL; if (skb_is_nonlinear(skb)) { - if (skb_linearize(skb, GFP_ATOMIC) != 0) { - kfree_skb(skb); - return -ENOMEM; - } + if (skb_linearize(skb, GFP_ATOMIC) != 0) + goto drop; pim = (struct pimreghdr*)skb->h.raw; } @@ -1475,19 +1482,15 @@ pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) || (pim->flags&PIM_NULL_REGISTER) || (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 && - ip_compute_csum((void *)pim, skb->len))) { - kfree_skb(skb); - return -EINVAL; - } + ip_compute_csum((void *)pim, skb->len))) + goto drop; /* check if the inner packet is destined to mcast group */ encap = (struct iphdr*)(skb->h.raw + sizeof(struct pimreghdr)); if (!MULTICAST(encap->daddr) || - ntohs(encap->tot_len) == 0 || - ntohs(encap->tot_len) + sizeof(*pim) > skb->len) { - kfree_skb(skb); - return -EINVAL; - } + encap->tot_len == 0 || + ntohs(encap->tot_len) + sizeof(*pim) > skb->len) + goto drop; read_lock(&mrt_lock); if (reg_vif_num >= 0) @@ -1496,10 +1499,8 @@ dev_hold(reg_dev); read_unlock(&mrt_lock); - if (reg_dev == NULL) { - kfree_skb(skb); - return -EINVAL; - } + if (reg_dev == NULL) + goto drop; skb->mac.raw = skb->nh.raw; skb_pull(skb, (u8*)encap - skb->data); @@ -1520,6 +1521,9 @@ netif_rx(skb); dev_put(reg_dev); return 0; + drop: + kfree_skb(skb); + return 0; } #endif @@ 
-1732,15 +1736,8 @@ #endif #ifdef CONFIG_IP_PIMSM_V2 -struct inet_protocol pim_protocol = -{ - pim_rcv, /* PIM handler */ - NULL, /* PIM error control */ - NULL, /* next */ - IPPROTO_PIM, /* protocol ID */ - 0, /* copy */ - NULL, /* data */ - "PIM" /* name */ +static struct inet_protocol pim_protocol = { + .handler = pim_rcv, }; #endif Index: net/ipv4/proc.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv4/proc.c,v retrieving revision 1.1.1.16 retrieving revision 1.1.1.16.2.1 diff -u -r1.1.1.16 -r1.1.1.16.2.1 --- a/net/ipv4/proc.c 13 Jun 2003 14:51:39 -0000 1.1.1.16 +++ b/net/ipv4/proc.c 16 Apr 2004 13:16:22 -0000 1.1.1.16.2.1 @@ -116,7 +116,6 @@ int snmp_get_info(char *buffer, char **start, off_t offset, int length) { - extern int sysctl_ip_default_ttl; int len, i; len = sprintf (buffer, Index: net/ipv4/protocol.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv4/protocol.c,v retrieving revision 1.1.1.15 retrieving revision 1.1.1.15.2.1 diff -u -r1.1.1.15 -r1.1.1.15.2.1 --- a/net/ipv4/protocol.c 20 May 2001 00:56:43 -0000 1.1.1.15 +++ b/net/ipv4/protocol.c 16 Apr 2004 13:16:22 -0000 1.1.1.15.2.1 @@ -48,134 +48,52 @@ #include #include -#define IPPROTO_PREVIOUS NULL - -#ifdef CONFIG_IP_MULTICAST - -static struct inet_protocol igmp_protocol = { - handler: igmp_rcv, - next: IPPROTO_PREVIOUS, - protocol: IPPROTO_IGMP, - name: "IGMP" -}; - -#undef IPPROTO_PREVIOUS -#define IPPROTO_PREVIOUS &igmp_protocol - -#endif - -static struct inet_protocol tcp_protocol = { - handler: tcp_v4_rcv, - err_handler: tcp_v4_err, - next: IPPROTO_PREVIOUS, - protocol: IPPROTO_TCP, - name: "TCP" -}; - -#undef IPPROTO_PREVIOUS -#define IPPROTO_PREVIOUS &tcp_protocol - -static struct inet_protocol udp_protocol = { - handler: udp_rcv, - err_handler: udp_err, - next: IPPROTO_PREVIOUS, - protocol: IPPROTO_UDP, - 
name: "UDP" -}; - -#undef IPPROTO_PREVIOUS -#define IPPROTO_PREVIOUS &udp_protocol - -static struct inet_protocol icmp_protocol = { - handler: icmp_rcv, - next: IPPROTO_PREVIOUS, - protocol: IPPROTO_ICMP, - name: "ICMP" -}; - -#undef IPPROTO_PREVIOUS -#define IPPROTO_PREVIOUS &icmp_protocol - - -struct inet_protocol *inet_protocol_base = IPPROTO_PREVIOUS; - struct inet_protocol *inet_protos[MAX_INET_PROTOS]; /* * Add a protocol handler to the hash tables */ -void inet_add_protocol(struct inet_protocol *prot) +int inet_add_protocol(struct inet_protocol *prot, unsigned char protocol) { - unsigned char hash; - struct inet_protocol *p2; + int hash, ret; + + hash = protocol & (MAX_INET_PROTOS - 1); - hash = prot->protocol & (MAX_INET_PROTOS - 1); br_write_lock_bh(BR_NETPROTO_LOCK); - prot ->next = inet_protos[hash]; - inet_protos[hash] = prot; - prot->copy = 0; - - /* - * Set the copy bit if we need to. - */ - - p2 = (struct inet_protocol *) prot->next; - while (p2) { - if (p2->protocol == prot->protocol) { - prot->copy = 1; - break; - } - p2 = (struct inet_protocol *) p2->next; + + if (inet_protos[hash]) { + ret = -1; + } else { + inet_protos[hash] = prot; + ret = 0; } + br_write_unlock_bh(BR_NETPROTO_LOCK); + + return ret; } /* * Remove a protocol from the hash tables. 
*/ -int inet_del_protocol(struct inet_protocol *prot) +int inet_del_protocol(struct inet_protocol *prot, unsigned char protocol) { - struct inet_protocol *p; - struct inet_protocol *lp = NULL; - unsigned char hash; - - hash = prot->protocol & (MAX_INET_PROTOS - 1); - br_write_lock_bh(BR_NETPROTO_LOCK); - if (prot == inet_protos[hash]) { - inet_protos[hash] = (struct inet_protocol *) inet_protos[hash]->next; - br_write_unlock_bh(BR_NETPROTO_LOCK); - return 0; - } + int hash, ret; - p = (struct inet_protocol *) inet_protos[hash]; + hash = protocol & (MAX_INET_PROTOS - 1); - if (p != NULL && p->protocol == prot->protocol) - lp = p; - - while (p) { - /* - * We have to worry if the protocol being deleted is - * the last one on the list, then we may need to reset - * someone's copied bit. - */ - if (p->next && p->next == prot) { - /* - * if we are the last one with this protocol and - * there is a previous one, reset its copy bit. - */ - if (prot->copy == 0 && lp != NULL) - lp->copy = 0; - p->next = prot->next; - br_write_unlock_bh(BR_NETPROTO_LOCK); - return 0; - } - if (p->next != NULL && p->next->protocol == prot->protocol) - lp = p->next; + br_write_lock_bh(BR_NETPROTO_LOCK); - p = (struct inet_protocol *) p->next; + if (inet_protos[hash] == prot) { + inet_protos[hash] = NULL; + ret = 0; + } else { + ret = -1; } + br_write_unlock_bh(BR_NETPROTO_LOCK); - return -1; + + return ret; } Index: net/ipv4/raw.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv4/raw.c,v retrieving revision 1.1.1.22 retrieving revision 1.1.1.22.2.1 diff -u -r1.1.1.22 -r1.1.1.22.2.1 --- a/net/ipv4/raw.c 25 Aug 2003 11:44:44 -0000 1.1.1.22 +++ b/net/ipv4/raw.c 16 Apr 2004 13:16:22 -0000 1.1.1.22.2.1 @@ -64,6 +64,8 @@ #include #include #include +#include +#include struct sock *raw_v4_htable[RAWV4_HTABLE_SIZE]; rwlock_t raw_v4_lock = RW_LOCK_UNLOCKED; @@ -132,13 +134,12 @@ } /* IP input processing comes 
here for RAW socket delivery. - * This is fun as to avoid copies we want to make no surplus - * copies. + * Caller owns SKB, so we must make clones. * * RFC 1122: SHOULD pass TOS value up to the transport layer. * -> It does. And not only TOS, but all IP header. */ -struct sock *raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) +void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) { struct sock *sk; @@ -150,28 +151,19 @@ skb->dev->ifindex); while (sk) { - struct sock *sknext = __raw_v4_lookup(sk->next, iph->protocol, - iph->saddr, iph->daddr, - skb->dev->ifindex); - if (iph->protocol != IPPROTO_ICMP || - !icmp_filter(sk, skb)) { - struct sk_buff *clone; - - if (!sknext) - break; - clone = skb_clone(skb, GFP_ATOMIC); + if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) { + struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); + /* Not releasing hash table! */ if (clone) raw_rcv(sk, clone); } - sk = sknext; + sk = __raw_v4_lookup(sk->next, iph->protocol, + iph->saddr, iph->daddr, + skb->dev->ifindex); } out: - if (sk) - sock_hold(sk); read_unlock(&raw_v4_lock); - - return sk; } void raw_err (struct sock *sk, struct sk_buff *skb, u32 info) @@ -244,71 +236,92 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb) { + if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { + kfree_skb(skb); + return NET_RX_DROP; + } + skb_push(skb, skb->data - skb->nh.raw); raw_rcv_skb(sk, skb); return 0; } -struct rawfakehdr -{ - struct iovec *iov; - u32 saddr; - struct dst_entry *dst; -}; +static int raw_send_hdrinc(struct sock *sk, void *from, int length, + struct rtable *rt, + unsigned int flags) +{ + struct inet_opt *inet = inet_sk(sk); + int hh_len; + struct iphdr *iph; + struct sk_buff *skb; + int err; -/* - * Send a RAW IP packet. 
- */ + if (length > rt->u.dst.dev->mtu) { + ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, + rt->u.dst.dev->mtu); + return -EMSGSIZE; + } + if (flags&MSG_PROBE) + goto out; -/* - * Callback support is trivial for SOCK_RAW - */ - -static int raw_getfrag(const void *p, char *to, unsigned int offset, - unsigned int fraglen) -{ - struct rawfakehdr *rfh = (struct rawfakehdr *) p; - return memcpy_fromiovecend(to, rfh->iov, offset, fraglen); -} + hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); -/* - * IPPROTO_RAW needs extra work. - */ - -static int raw_getrawfrag(const void *p, char *to, unsigned int offset, - unsigned int fraglen) -{ - struct rawfakehdr *rfh = (struct rawfakehdr *) p; + skb = sock_alloc_send_skb(sk, length+hh_len+15, + flags&MSG_DONTWAIT, &err); + if (skb == NULL) + goto error; + skb_reserve(skb, hh_len); - if (memcpy_fromiovecend(to, rfh->iov, offset, fraglen)) - return -EFAULT; + skb->priority = sk->priority; + skb->dst = dst_clone(&rt->u.dst); + + skb->nh.iph = iph = (struct iphdr *)skb_put(skb, length); + + skb->ip_summed = CHECKSUM_NONE; - if (!offset) { - struct iphdr *iph = (struct iphdr *)to; + skb->h.raw = skb->nh.raw; + err = memcpy_fromiovecend((void *)iph, from, 0, length); + if (err) + goto error_fault; + + /* We don't modify invalid header */ + if (length >= sizeof(*iph) && iph->ihl * 4 <= length) { if (!iph->saddr) - iph->saddr = rfh->saddr; + iph->saddr = rt->rt_src; iph->check = 0; - iph->tot_len = htons(fraglen); /* This is right as you can't - frag RAW packets */ - /* - * Deliberate breach of modularity to keep - * ip_build_xmit clean (well less messy). - */ + iph->tot_len = htons(length); if (!iph->id) - ip_select_ident(iph, rfh->dst, NULL); + ip_select_ident(iph, &rt->u.dst, NULL); + iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); } + + err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, + dst_output); + if (err > 0) + err = inet->recverr ? 
net_xmit_errno(err) : 0; + if (err) + goto error; +out: return 0; + +error_fault: + err = -EFAULT; + kfree_skb(skb); +error: + IP_INC_STATS(IpOutDiscards); + return err; } static int raw_sendmsg(struct sock *sk, struct msghdr *msg, int len) { + struct inet_opt *inet = inet_sk(sk); struct ipcm_cookie ipc; - struct rawfakehdr rfh; struct rtable *rt = NULL; int free = 0; u32 daddr; + u32 saddr; u8 tos; int err; @@ -378,7 +391,7 @@ free = 1; } - rfh.saddr = ipc.addr; + saddr = ipc.addr; ipc.addr = daddr; if (!ipc.opt) @@ -404,12 +417,19 @@ if (MULTICAST(daddr)) { if (!ipc.oif) ipc.oif = sk->protinfo.af_inet.mc_index; - if (!rfh.saddr) - rfh.saddr = sk->protinfo.af_inet.mc_addr; + if (!saddr) + saddr = sk->protinfo.af_inet.mc_addr; } - err = ip_route_output(&rt, daddr, rfh.saddr, tos, ipc.oif); - + { + struct flowi fl = { .oif = ipc.oif, + .nl_u = { .ip4_u = + { .daddr = daddr, + .saddr = saddr, + .tos = tos } }, + .proto = inet->hdrincl ? IPPROTO_RAW : sk->protocol }; + err = ip_route_output_flow(&rt, &fl, sk, !(msg->msg_flags&MSG_DONTWAIT)); + } if (err) goto done; @@ -421,14 +441,22 @@ goto do_confirm; back_from_confirm: - rfh.iov = msg->msg_iov; - rfh.saddr = rt->rt_src; - rfh.dst = &rt->u.dst; - if (!ipc.addr) - ipc.addr = rt->rt_dst; - err = ip_build_xmit(sk, sk->protinfo.af_inet.hdrincl ? 
raw_getrawfrag : - raw_getfrag, &rfh, len, &ipc, rt, msg->msg_flags); - + if (inet->hdrincl) + err = raw_send_hdrinc(sk, msg->msg_iov, len, + rt, msg->msg_flags); + + else { + if (!ipc.addr) + ipc.addr = rt->rt_dst; + lock_sock(sk); + err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0, + &ipc, rt, msg->msg_flags); + if (err) + ip_flush_pending_frames(sk); + else if (!(msg->msg_flags & MSG_MORE)) + err = ip_push_pending_frames(sk); + release_sock(sk); + } done: if (free) kfree(ipc.opt); Index: net/ipv4/route.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv4/route.c,v retrieving revision 1.1.1.30 retrieving revision 1.1.1.30.2.1 diff -u -r1.1.1.30 -r1.1.1.30.2.1 --- a/net/ipv4/route.c 28 Nov 2003 18:26:21 -0000 1.1.1.30 +++ b/net/ipv4/route.c 16 Apr 2004 13:16:22 -0000 1.1.1.30.2.1 @@ -95,6 +95,7 @@ #include #include #include +#include #ifdef CONFIG_SYSCTL #include #endif @@ -132,11 +133,10 @@ */ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); -static struct dst_entry *ipv4_dst_reroute(struct dst_entry *dst, - struct sk_buff *skb); static void ipv4_dst_destroy(struct dst_entry *dst); static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); static void ipv4_link_failure(struct sk_buff *skb); +static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu); static int rt_garbage_collect(void); @@ -145,10 +145,10 @@ protocol: __constant_htons(ETH_P_IP), gc: rt_garbage_collect, check: ipv4_dst_check, - reroute: ipv4_dst_reroute, destroy: ipv4_dst_destroy, negative_advice: ipv4_negative_advice, link_failure: ipv4_link_failure, + update_pmtu: ip_rt_update_pmtu, entry_size: sizeof(struct rtable), }; @@ -248,11 +248,12 @@ r->u.dst.__use, 0, (unsigned long)r->rt_src, - (r->u.dst.advmss ? 
- (int) r->u.dst.advmss + 40 : 0), - r->u.dst.window, - (int)((r->u.dst.rtt >> 3) + r->u.dst.rttvar), - r->key.tos, + (dst_metric(&r->u.dst, RTAX_ADVMSS) ? + (int) dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0), + dst_metric(&r->u.dst, RTAX_WINDOW), + (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) + + dst_metric(&r->u.dst, RTAX_RTTVAR)), + r->fl.fl4_tos, r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1, @@ -337,7 +338,7 @@ /* Kill broadcast/multicast entries very aggresively, if they collide in hash table with more useful entries */ return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) && - rth->key.iif && rth->u.rt_next; + rth->fl.iif && rth->u.rt_next; } static __inline__ int rt_valuable(struct rtable *rth) @@ -382,7 +383,7 @@ if (rt_valuable(rt)) score |= (1<<31); - if (!rt->key.iif || + if (!rt->fl.iif || !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL))) score |= (1<<30); @@ -647,6 +648,13 @@ out: return 0; } +static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) +{ + return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 && + fl1->oif == fl2->oif && + fl1->iif == fl2->iif; +} + static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp) { struct rtable *rth, **rthp; @@ -667,7 +675,7 @@ write_lock_bh(&rt_hash_table[hash].lock); while ((rth = *rthp) != NULL) { - if (memcmp(&rth->key, &rt->key, sizeof(rt->key)) == 0) { + if (compare_keys(&rth->fl, &rt->fl)) { /* Put it first */ *rthp = rth->u.rt_next; rth->u.rt_next = rt_hash_table[hash].chain; @@ -714,7 +722,7 @@ /* Try to bind route to arp only if it is output route or unicast forwarding path. 
*/ - if (rt->rt_type == RTN_UNICAST || rt->key.iif == 0) { + if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { int err = arp_bind_neighbour(&rt->u.dst); if (err) { write_unlock_bh(&rt_hash_table[hash].lock); @@ -877,11 +885,11 @@ while ((rth = *rthp) != NULL) { struct rtable *rt; - if (rth->key.dst != daddr || - rth->key.src != skeys[i] || - rth->key.tos != tos || - rth->key.oif != ikeys[k] || - rth->key.iif != 0) { + if (rth->fl.fl4_dst != daddr || + rth->fl.fl4_src != skeys[i] || + rth->fl.fl4_tos != tos || + rth->fl.oif != ikeys[k] || + rth->fl.iif != 0) { rthp = &rth->u.rt_next; continue; } @@ -907,12 +915,15 @@ *rt = *rth; rt->u.dst.__use = 1; atomic_set(&rt->u.dst.__refcnt, 1); + rt->u.dst.child = NULL; if (rt->u.dst.dev) dev_hold(rt->u.dst.dev); + rt->u.dst.obsolete = 0; rt->u.dst.lastuse = jiffies; + rt->u.dst.path = &rt->u.dst; rt->u.dst.neighbour = NULL; rt->u.dst.hh = NULL; - rt->u.dst.obsolete = 0; + rt->u.dst.xfrm = NULL; rt->rt_flags |= RTCF_REDIRECTED; @@ -972,14 +983,14 @@ ret = NULL; } else if ((rt->rt_flags & RTCF_REDIRECTED) || rt->u.dst.expires) { - unsigned hash = rt_hash_code(rt->key.dst, - rt->key.src ^ - (rt->key.oif << 5), - rt->key.tos); + unsigned hash = rt_hash_code(rt->fl.fl4_dst, + rt->fl.fl4_src ^ + (rt->fl.oif << 5), + rt->fl.fl4_tos); #if RT_CACHE_DEBUG >= 1 printk(KERN_DEBUG "ip_rt_advice: redirect to " "%u.%u.%u.%u/%02x dropped\n", - NIPQUAD(rt->rt_dst), rt->key.tos); + NIPQUAD(rt->rt_dst), rt->fl.fl4_tos); #endif rt_del(hash, rt); ret = NULL; @@ -1124,34 +1135,34 @@ read_lock(&rt_hash_table[hash].lock); for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) { - if (rth->key.dst == daddr && - rth->key.src == skeys[i] && + if (rth->fl.fl4_dst == daddr && + rth->fl.fl4_src == skeys[i] && rth->rt_dst == daddr && rth->rt_src == iph->saddr && - rth->key.tos == tos && - rth->key.iif == 0 && - !(rth->u.dst.mxlock & (1 << RTAX_MTU))) { + rth->fl.fl4_tos == tos && + rth->fl.iif == 0 && + !(dst_metric_locked(&rth->u.dst, 
RTAX_MTU))) { unsigned short mtu = new_mtu; if (new_mtu < 68 || new_mtu >= old_mtu) { /* BSD 4.2 compatibility hack :-( */ if (mtu == 0 && - old_mtu >= rth->u.dst.pmtu && + old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] && old_mtu >= 68 + (iph->ihl << 2)) old_mtu -= iph->ihl << 2; mtu = guess_mtu(old_mtu); } - if (mtu <= rth->u.dst.pmtu) { - if (mtu < rth->u.dst.pmtu) { + if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) { + if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) { dst_confirm(&rth->u.dst); if (mtu < ip_rt_min_pmtu) { mtu = ip_rt_min_pmtu; - rth->u.dst.mxlock |= + rth->u.dst.metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU); } - rth->u.dst.pmtu = mtu; + rth->u.dst.metrics[RTAX_MTU-1] = mtu; dst_set_expires(&rth->u.dst, ip_rt_mtu_expires); } @@ -1164,15 +1175,15 @@ return est_mtu ? : new_mtu; } -void ip_rt_update_pmtu(struct dst_entry *dst, unsigned mtu) +static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) { - if (dst->pmtu > mtu && mtu >= 68 && - !(dst->mxlock & (1 << RTAX_MTU))) { + if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 && + !(dst_metric_locked(dst, RTAX_MTU))) { if (mtu < ip_rt_min_pmtu) { mtu = ip_rt_min_pmtu; - dst->mxlock |= (1 << RTAX_MTU); + dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU); } - dst->pmtu = mtu; + dst->metrics[RTAX_MTU-1] = mtu; dst_set_expires(dst, ip_rt_mtu_expires); } } @@ -1183,12 +1194,6 @@ return NULL; } -static struct dst_entry *ipv4_dst_reroute(struct dst_entry *dst, - struct sk_buff *skb) -{ - return NULL; -} - static void ipv4_dst_destroy(struct dst_entry *dst) { struct rtable *rt = (struct rtable *) dst; @@ -1234,9 +1239,9 @@ u32 src; struct fib_result res; - if (rt->key.iif == 0) + if (rt->fl.iif == 0) src = rt->rt_src; - else if (fib_lookup(&rt->key, &res) == 0) { + else if (fib_lookup(&rt->fl, &res) == 0) { #ifdef CONFIG_IP_ROUTE_NAT if (res.type == RTN_NAT) src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, @@ -1269,28 +1274,30 @@ if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) rt->rt_gateway = 
FIB_RES_GW(*res); - memcpy(&rt->u.dst.mxlock, fi->fib_metrics, - sizeof(fi->fib_metrics)); + memcpy(rt->u.dst.metrics, fi->fib_metrics, + sizeof(rt->u.dst.metrics)); if (fi->fib_mtu == 0) { - rt->u.dst.pmtu = rt->u.dst.dev->mtu; - if (rt->u.dst.mxlock & (1 << RTAX_MTU) && + rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu; + if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) && rt->rt_gateway != rt->rt_dst && - rt->u.dst.pmtu > 576) - rt->u.dst.pmtu = 576; + rt->u.dst.dev->mtu > 576) + rt->u.dst.metrics[RTAX_MTU-1] = 576; } #ifdef CONFIG_NET_CLS_ROUTE rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid; #endif } else - rt->u.dst.pmtu = rt->u.dst.dev->mtu; + rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu; - if (rt->u.dst.pmtu > IP_MAX_MTU) - rt->u.dst.pmtu = IP_MAX_MTU; - if (rt->u.dst.advmss == 0) - rt->u.dst.advmss = max_t(unsigned int, rt->u.dst.dev->mtu - 40, + if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0) + rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl; + if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU) + rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU; + if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0) + rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40, ip_rt_min_advmss); - if (rt->u.dst.advmss > 65535 - 40) - rt->u.dst.advmss = 65535 - 40; + if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40) + rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40; #ifdef CONFIG_NET_CLS_ROUTE #ifdef CONFIG_IP_MULTIPLE_TABLES @@ -1335,13 +1342,15 @@ atomic_set(&rth->u.dst.__refcnt, 1); rth->u.dst.flags= DST_HOST; - rth->key.dst = daddr; + if (in_dev->cnf.no_policy) + rth->u.dst.flags |= DST_NOPOLICY; + rth->fl.fl4_dst = daddr; rth->rt_dst = daddr; - rth->key.tos = tos; + rth->fl.fl4_tos = tos; #ifdef CONFIG_IP_ROUTE_FWMARK - rth->key.fwmark = skb->nfmark; + rth->fl.fl4_fwmark= skb->nfmark; #endif - rth->key.src = saddr; + rth->fl.fl4_src = saddr; rth->rt_src = saddr; #ifdef CONFIG_IP_ROUTE_NAT rth->rt_dst_map = daddr; @@ -1351,10 +1360,10 @@ 
rth->u.dst.tclassid = itag; #endif rth->rt_iif = - rth->key.iif = dev->ifindex; + rth->fl.iif = dev->ifindex; rth->u.dst.dev = &loopback_dev; dev_hold(rth->u.dst.dev); - rth->key.oif = 0; + rth->fl.oif = 0; rth->rt_gateway = daddr; rth->rt_spec_dst= spec_dst; rth->rt_type = RTN_MULTICAST; @@ -1396,10 +1405,19 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr, u8 tos, struct net_device *dev) { - struct rt_key key; struct fib_result res; struct in_device *in_dev = in_dev_get(dev); struct in_device *out_dev = NULL; + struct flowi fl = { .nl_u = { .ip4_u = + { .daddr = daddr, + .saddr = saddr, + .tos = tos, + .scope = RT_SCOPE_UNIVERSE, +#ifdef CONFIG_IP_ROUTE_FWMARK + .fwmark = skb->nfmark +#endif + } }, + .iif = dev->ifindex }; unsigned flags = 0; u32 itag = 0; struct rtable * rth; @@ -1413,17 +1431,7 @@ if (!in_dev) goto out; - key.dst = daddr; - key.src = saddr; - key.tos = tos; -#ifdef CONFIG_IP_ROUTE_FWMARK - key.fwmark = skb->nfmark; -#endif - key.iif = dev->ifindex; - key.oif = 0; - key.scope = RT_SCOPE_UNIVERSE; - - hash = rt_hash_code(daddr, saddr ^ (key.iif << 5), tos); + hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos); /* Check for the most weird martians, which can be not detected by fib_lookup. @@ -1447,7 +1455,7 @@ /* * Now we are ready to route packet. 
*/ - if ((err = fib_lookup(&key, &res)) != 0) { + if ((err = fib_lookup(&fl, &res)) != 0) { if (!IN_DEV_FORWARD(in_dev)) goto e_inval; goto no_route; @@ -1467,17 +1475,17 @@ src_map = fib_rules_policy(saddr, &res, &flags); if (res.type == RTN_NAT) { - key.dst = fib_rules_map_destination(daddr, &res); + fl.fl4_dst = fib_rules_map_destination(daddr, &res); fib_res_put(&res); free_res = 0; - if (fib_lookup(&key, &res)) + if (fib_lookup(&fl, &res)) goto e_inval; free_res = 1; if (res.type != RTN_UNICAST) goto e_inval; flags |= RTCF_DNAT; } - key.src = src_map; + fl.fl4_src = src_map; } #endif @@ -1503,8 +1511,8 @@ goto martian_destination; #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res.fi->fib_nhs > 1 && key.oif == 0) - fib_select_multipath(&key, &res); + if (res.fi->fib_nhs > 1 && fl.oif == 0) + fib_select_multipath(&fl, &res); #endif out_dev = in_dev_get(FIB_RES_DEV(res)); if (out_dev == NULL) { @@ -1541,26 +1549,30 @@ atomic_set(&rth->u.dst.__refcnt, 1); rth->u.dst.flags= DST_HOST; - rth->key.dst = daddr; + if (in_dev->cnf.no_policy) + rth->u.dst.flags |= DST_NOPOLICY; + if (in_dev->cnf.no_xfrm) + rth->u.dst.flags |= DST_NOXFRM; + rth->fl.fl4_dst = daddr; rth->rt_dst = daddr; - rth->key.tos = tos; + rth->fl.fl4_tos = tos; #ifdef CONFIG_IP_ROUTE_FWMARK - rth->key.fwmark = skb->nfmark; + rth->fl.fl4_fwmark= skb->nfmark; #endif - rth->key.src = saddr; + rth->fl.fl4_src = saddr; rth->rt_src = saddr; rth->rt_gateway = daddr; #ifdef CONFIG_IP_ROUTE_NAT - rth->rt_src_map = key.src; - rth->rt_dst_map = key.dst; + rth->rt_src_map = fl.fl4_src; + rth->rt_dst_map = fl.fl4_dst; if (flags&RTCF_DNAT) - rth->rt_gateway = key.dst; + rth->rt_gateway = fl.fl4_dst; #endif rth->rt_iif = - rth->key.iif = dev->ifindex; + rth->fl.iif = dev->ifindex; rth->u.dst.dev = out_dev->dev; dev_hold(rth->u.dst.dev); - rth->key.oif = 0; + rth->fl.oif = 0; rth->rt_spec_dst= spec_dst; rth->u.dst.input = ip_forward; @@ -1618,26 +1630,27 @@ atomic_set(&rth->u.dst.__refcnt, 1); rth->u.dst.flags= DST_HOST; - 
rth->key.dst = daddr; + if (in_dev->cnf.no_policy) + rth->u.dst.flags |= DST_NOPOLICY; + rth->fl.fl4_dst = daddr; rth->rt_dst = daddr; - rth->key.tos = tos; + rth->fl.fl4_tos = tos; #ifdef CONFIG_IP_ROUTE_FWMARK - rth->key.fwmark = skb->nfmark; + rth->fl.fl4_fwmark= skb->nfmark; #endif - rth->key.src = saddr; + rth->fl.fl4_src = saddr; rth->rt_src = saddr; #ifdef CONFIG_IP_ROUTE_NAT - rth->rt_dst_map = key.dst; - rth->rt_src_map = key.src; + rth->rt_dst_map = fl.fl4_dst; + rth->rt_src_map = fl.fl4_src; #endif #ifdef CONFIG_NET_CLS_ROUTE rth->u.dst.tclassid = itag; #endif rth->rt_iif = - rth->key.iif = dev->ifindex; + rth->fl.iif = dev->ifindex; rth->u.dst.dev = &loopback_dev; dev_hold(rth->u.dst.dev); - rth->key.oif = 0; rth->rt_gateway = daddr; rth->rt_spec_dst= spec_dst; rth->u.dst.input= ip_local_deliver; @@ -1715,14 +1728,14 @@ read_lock(&rt_hash_table[hash].lock); for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) { - if (rth->key.dst == daddr && - rth->key.src == saddr && - rth->key.iif == iif && - rth->key.oif == 0 && + if (rth->fl.fl4_dst == daddr && + rth->fl.fl4_src == saddr && + rth->fl.iif == iif && + rth->fl.oif == 0 && #ifdef CONFIG_IP_ROUTE_FWMARK - rth->key.fwmark == skb->nfmark && + rth->fl.fl4_fwmark == skb->nfmark && #endif - rth->key.tos == tos) { + rth->fl.fl4_tos == tos) { rth->u.dst.lastuse = jiffies; dst_hold(&rth->u.dst); rth->u.dst.__use++; @@ -1772,43 +1785,45 @@ * Major route resolver routine. */ -int ip_route_output_slow(struct rtable **rp, const struct rt_key *oldkey) +int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) { - struct rt_key key; + u32 tos = oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK); + struct flowi fl = { .nl_u = { .ip4_u = + { .daddr = oldflp->fl4_dst, + .saddr = oldflp->fl4_src, + .tos = tos & IPTOS_RT_MASK, + .scope = ((tos & RTO_ONLINK) ? 
+ RT_SCOPE_LINK : + RT_SCOPE_UNIVERSE), +#ifdef CONFIG_IP_ROUTE_FWMARK + .fwmark = oldflp->fl4_fwmark +#endif + } }, + .iif = loopback_dev.ifindex, + .oif = oldflp->oif }; struct fib_result res; unsigned flags = 0; struct rtable *rth; struct net_device *dev_out = NULL; + struct in_device *in_dev = NULL; unsigned hash; int free_res = 0; int err; - u32 tos; - tos = oldkey->tos & (IPTOS_RT_MASK | RTO_ONLINK); - key.dst = oldkey->dst; - key.src = oldkey->src; - key.tos = tos & IPTOS_RT_MASK; - key.iif = loopback_dev.ifindex; - key.oif = oldkey->oif; -#ifdef CONFIG_IP_ROUTE_FWMARK - key.fwmark = oldkey->fwmark; -#endif - key.scope = (tos & RTO_ONLINK) ? RT_SCOPE_LINK : - RT_SCOPE_UNIVERSE; res.fi = NULL; #ifdef CONFIG_IP_MULTIPLE_TABLES res.r = NULL; #endif - if (oldkey->src) { + if (oldflp->fl4_src) { err = -EINVAL; - if (MULTICAST(oldkey->src) || - BADCLASS(oldkey->src) || - ZERONET(oldkey->src)) + if (MULTICAST(oldflp->fl4_src) || + BADCLASS(oldflp->fl4_src) || + ZERONET(oldflp->fl4_src)) goto out; /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ - dev_out = ip_dev_find(oldkey->src); + dev_out = ip_dev_find(oldflp->fl4_src); if (dev_out == NULL) goto out; @@ -1820,8 +1835,8 @@ of another iface. --ANK */ - if (oldkey->oif == 0 - && (MULTICAST(oldkey->dst) || oldkey->dst == 0xFFFFFFFF)) { + if (oldflp->oif == 0 + && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) { /* Special hack: user can direct multicasts and limited broadcast via necessary interface without fiddling with IP_MULTICAST_IF or IP_PKTINFO. @@ -1837,15 +1852,15 @@ Luckily, this hack is good workaround. 
*/ - key.oif = dev_out->ifindex; + fl.oif = dev_out->ifindex; goto make_route; } if (dev_out) dev_put(dev_out); dev_out = NULL; } - if (oldkey->oif) { - dev_out = dev_get_by_index(oldkey->oif); + if (oldflp->oif) { + dev_out = dev_get_by_index(oldflp->oif); err = -ENODEV; if (dev_out == NULL) goto out; @@ -1854,39 +1869,39 @@ goto out; /* Wrong error code */ } - if (LOCAL_MCAST(oldkey->dst) || oldkey->dst == 0xFFFFFFFF) { - if (!key.src) - key.src = inet_select_addr(dev_out, 0, - RT_SCOPE_LINK); + if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) { + if (!fl.fl4_src) + fl.fl4_src = inet_select_addr(dev_out, 0, + RT_SCOPE_LINK); goto make_route; } - if (!key.src) { - if (MULTICAST(oldkey->dst)) - key.src = inet_select_addr(dev_out, 0, - key.scope); - else if (!oldkey->dst) - key.src = inet_select_addr(dev_out, 0, - RT_SCOPE_HOST); + if (!fl.fl4_src) { + if (MULTICAST(oldflp->fl4_dst)) + fl.fl4_src = inet_select_addr(dev_out, 0, + fl.fl4_scope); + else if (!oldflp->fl4_dst) + fl.fl4_src = inet_select_addr(dev_out, 0, + RT_SCOPE_HOST); } } - if (!key.dst) { - key.dst = key.src; - if (!key.dst) - key.dst = key.src = htonl(INADDR_LOOPBACK); + if (!fl.fl4_dst) { + fl.fl4_dst = fl.fl4_src; + if (!fl.fl4_dst) + fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK); if (dev_out) dev_put(dev_out); dev_out = &loopback_dev; dev_hold(dev_out); - key.oif = loopback_dev.ifindex; + fl.oif = loopback_dev.ifindex; res.type = RTN_LOCAL; flags |= RTCF_LOCAL; goto make_route; } - if (fib_lookup(&key, &res)) { + if (fib_lookup(&fl, &res)) { res.fi = NULL; - if (oldkey->oif) { + if (oldflp->oif) { /* Apparently, routing tables are wrong. Assume, that the destination is on link. @@ -1905,9 +1920,9 @@ likely IPv6, but we do not. 
*/ - if (key.src == 0) - key.src = inet_select_addr(dev_out, 0, - RT_SCOPE_LINK); + if (fl.fl4_src == 0) + fl.fl4_src = inet_select_addr(dev_out, 0, + RT_SCOPE_LINK); res.type = RTN_UNICAST; goto make_route; } @@ -1922,13 +1937,13 @@ goto e_inval; if (res.type == RTN_LOCAL) { - if (!key.src) - key.src = key.dst; + if (!fl.fl4_src) + fl.fl4_src = fl.fl4_dst; if (dev_out) dev_put(dev_out); dev_out = &loopback_dev; dev_hold(dev_out); - key.oif = dev_out->ifindex; + fl.oif = dev_out->ifindex; if (res.fi) fib_info_put(res.fi); res.fi = NULL; @@ -1937,36 +1952,40 @@ } #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res.fi->fib_nhs > 1 && key.oif == 0) - fib_select_multipath(&key, &res); + if (res.fi->fib_nhs > 1 && fl.oif == 0) + fib_select_multipath(&fl, &res); else #endif - if (!res.prefixlen && res.type == RTN_UNICAST && !key.oif) - fib_select_default(&key, &res); + if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif) + fib_select_default(&fl, &res); - if (!key.src) - key.src = FIB_RES_PREFSRC(res); + if (!fl.fl4_src) + fl.fl4_src = FIB_RES_PREFSRC(res); if (dev_out) dev_put(dev_out); dev_out = FIB_RES_DEV(res); dev_hold(dev_out); - key.oif = dev_out->ifindex; + fl.oif = dev_out->ifindex; make_route: - if (LOOPBACK(key.src) && !(dev_out->flags&IFF_LOOPBACK)) + if (LOOPBACK(fl.fl4_src) && !(dev_out->flags&IFF_LOOPBACK)) goto e_inval; - if (key.dst == 0xFFFFFFFF) + if (fl.fl4_dst == 0xFFFFFFFF) res.type = RTN_BROADCAST; - else if (MULTICAST(key.dst)) + else if (MULTICAST(fl.fl4_dst)) res.type = RTN_MULTICAST; - else if (BADCLASS(key.dst) || ZERONET(key.dst)) + else if (BADCLASS(fl.fl4_dst) || ZERONET(fl.fl4_dst)) goto e_inval; if (dev_out->flags & IFF_LOOPBACK) flags |= RTCF_LOCAL; + in_dev = in_dev_get(dev_out); + if (!in_dev) + goto e_inval; + if (res.type == RTN_BROADCAST) { flags |= RTCF_BROADCAST | RTCF_LOCAL; if (res.fi) { @@ -1975,11 +1994,8 @@ } } else if (res.type == RTN_MULTICAST) { flags |= RTCF_MULTICAST|RTCF_LOCAL; - read_lock(&inetdev_lock); - if 
(!__in_dev_get(dev_out) || - !ip_check_mc(__in_dev_get(dev_out),oldkey->dst,oldkey->src)) + if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src)) flags &= ~RTCF_LOCAL; - read_unlock(&inetdev_lock); /* If multicast route do not exist use default one, but do not gateway in this case. Yes, it is hack. @@ -1996,25 +2012,28 @@ atomic_set(&rth->u.dst.__refcnt, 1); rth->u.dst.flags= DST_HOST; - rth->key.dst = oldkey->dst; - rth->key.tos = tos; - rth->key.src = oldkey->src; - rth->key.iif = 0; - rth->key.oif = oldkey->oif; + if (in_dev->cnf.no_xfrm) + rth->u.dst.flags |= DST_NOXFRM; + if (in_dev->cnf.no_policy) + rth->u.dst.flags |= DST_NOPOLICY; + rth->fl.fl4_dst = oldflp->fl4_dst; + rth->fl.fl4_tos = tos; + rth->fl.fl4_src = oldflp->fl4_src; + rth->fl.oif = oldflp->oif; #ifdef CONFIG_IP_ROUTE_FWMARK - rth->key.fwmark = oldkey->fwmark; + rth->fl.fl4_fwmark= oldflp->fl4_fwmark; #endif - rth->rt_dst = key.dst; - rth->rt_src = key.src; + rth->rt_dst = fl.fl4_dst; + rth->rt_src = fl.fl4_src; #ifdef CONFIG_IP_ROUTE_NAT - rth->rt_dst_map = key.dst; - rth->rt_src_map = key.src; + rth->rt_dst_map = fl.fl4_dst; + rth->rt_src_map = fl.fl4_src; #endif - rth->rt_iif = oldkey->oif ? : dev_out->ifindex; + rth->rt_iif = oldflp->oif ? 
: dev_out->ifindex; rth->u.dst.dev = dev_out; dev_hold(dev_out); - rth->rt_gateway = key.dst; - rth->rt_spec_dst= key.src; + rth->rt_gateway = fl.fl4_dst; + rth->rt_spec_dst= fl.fl4_src; rth->u.dst.output=ip_output; @@ -2022,40 +2041,39 @@ if (flags & RTCF_LOCAL) { rth->u.dst.input = ip_local_deliver; - rth->rt_spec_dst = key.dst; + rth->rt_spec_dst = fl.fl4_dst; } if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { - rth->rt_spec_dst = key.src; + rth->rt_spec_dst = fl.fl4_src; if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) { rth->u.dst.output = ip_mc_output; rt_cache_stat[smp_processor_id()].out_slow_mc++; } #ifdef CONFIG_IP_MROUTE if (res.type == RTN_MULTICAST) { - struct in_device *in_dev = in_dev_get(dev_out); - if (in_dev) { - if (IN_DEV_MFORWARD(in_dev) && - !LOCAL_MCAST(oldkey->dst)) { - rth->u.dst.input = ip_mr_input; - rth->u.dst.output = ip_mc_output; - } - in_dev_put(in_dev); + if (IN_DEV_MFORWARD(in_dev) && + !LOCAL_MCAST(oldflp->fl4_dst)) { + rth->u.dst.input = ip_mr_input; + rth->u.dst.output = ip_mc_output; } } #endif } rt_set_nexthop(rth, &res, 0); + rth->rt_flags = flags; - hash = rt_hash_code(oldkey->dst, oldkey->src ^ (oldkey->oif << 5), tos); + hash = rt_hash_code(oldflp->fl4_dst, oldflp->fl4_src ^ (oldflp->oif << 5), tos); err = rt_intern_hash(hash, rth, rp); done: if (free_res) fib_res_put(&res); if (dev_out) dev_put(dev_out); + if (in_dev) + in_dev_put(in_dev); out: return err; e_inval: @@ -2066,23 +2084,23 @@ goto done; } -int ip_route_output_key(struct rtable **rp, const struct rt_key *key) +int __ip_route_output_key(struct rtable **rp, const struct flowi *flp) { unsigned hash; struct rtable *rth; - hash = rt_hash_code(key->dst, key->src ^ (key->oif << 5), key->tos); + hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos); read_lock_bh(&rt_hash_table[hash].lock); for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) { - if (rth->key.dst == key->dst && - rth->key.src == key->src && - 
rth->key.iif == 0 && - rth->key.oif == key->oif && + if (rth->fl.fl4_dst == flp->fl4_dst && + rth->fl.fl4_src == flp->fl4_src && + rth->fl.iif == 0 && + rth->fl.oif == flp->oif && #ifdef CONFIG_IP_ROUTE_FWMARK - rth->key.fwmark == key->fwmark && + rth->fl.fl4_fwmark == flp->fl4_fwmark && #endif - !((rth->key.tos ^ key->tos) & + !((rth->fl.fl4_tos ^ flp->fl4_tos) & (IPTOS_RT_MASK | RTO_ONLINK))) { rth->u.dst.lastuse = jiffies; dst_hold(&rth->u.dst); @@ -2096,8 +2114,26 @@ } read_unlock_bh(&rt_hash_table[hash].lock); - return ip_route_output_slow(rp, key); -} + return ip_route_output_slow(rp, flp); +} + +int ip_route_output_key(struct rtable **rp, struct flowi *flp) +{ + int err; + + if ((err = __ip_route_output_key(rp, flp)) != 0) + return err; + return flp->proto ? xfrm_lookup((struct dst_entry**)rp, flp, NULL, 0) : 0; +} + +int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags) +{ + int err; + + if ((err = __ip_route_output_key(rp, flp)) != 0) + return err; + return flp->proto ? 
xfrm_lookup((struct dst_entry**)rp, flp, sk, flags) : 0; +} static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int nowait) @@ -2116,7 +2152,7 @@ r->rtm_family = AF_INET; r->rtm_dst_len = 32; r->rtm_src_len = 0; - r->rtm_tos = rt->key.tos; + r->rtm_tos = rt->fl.fl4_tos; r->rtm_table = RT_TABLE_MAIN; r->rtm_type = rt->rt_type; r->rtm_scope = RT_SCOPE_UNIVERSE; @@ -2125,9 +2161,9 @@ if (rt->rt_flags & RTCF_NOTIFY) r->rtm_flags |= RTM_F_NOTIFY; RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst); - if (rt->key.src) { + if (rt->fl.fl4_src) { r->rtm_src_len = 32; - RTA_PUT(skb, RTA_SRC, 4, &rt->key.src); + RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src); } if (rt->u.dst.dev) RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex); @@ -2135,13 +2171,13 @@ if (rt->u.dst.tclassid) RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid); #endif - if (rt->key.iif) + if (rt->fl.iif) RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst); - else if (rt->rt_src != rt->key.src) + else if (rt->rt_src != rt->fl.fl4_src) RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src); if (rt->rt_dst != rt->rt_gateway) RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway); - if (rtnetlink_put_metrics(skb, &rt->u.dst.mxlock) < 0) + if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0) goto rtattr_failure; ci.rta_lastuse = jiffies - rt->u.dst.lastuse; ci.rta_used = rt->u.dst.__use; @@ -2163,7 +2199,7 @@ eptr = (struct rtattr*)skb->tail; #endif RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci); - if (rt->key.iif) { + if (rt->fl.iif) { #ifdef CONFIG_IP_MROUTE u32 dst = rt->rt_dst; @@ -2183,7 +2219,7 @@ } } else #endif - RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->key.iif); + RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif); } nlh->nlmsg_len = skb->tail - b; @@ -2237,10 +2273,14 @@ if (!err && rt->u.dst.error) err = -rt->u.dst.error; } else { + struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst, + .saddr = src, + .tos = rtm->rtm_tos } } }; int oif = 0; if (rta[RTA_OIF - 1]) memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int)); - err 
= ip_route_output(&rt, dst, src, rtm->rtm_tos, oif); + fl.oif = oif; + err = ip_route_output_key(&rt, &fl); } if (err) goto out_free; @@ -2629,4 +2669,8 @@ #ifdef CONFIG_NET_CLS_ROUTE create_proc_read_entry("net/rt_acct", 0, 0, ip_rt_acct_read, NULL); #endif +#ifdef CONFIG_XFRM + xfrm_init(); + xfrm4_init(); +#endif } Index: net/ipv4/syncookies.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv4/syncookies.c,v retrieving revision 1.1.1.21 retrieving revision 1.1.1.21.2.1 diff -u -r1.1.1.21 -r1.1.1.21.2.1 --- a/net/ipv4/syncookies.c 3 Aug 2002 00:39:46 -0000 1.1.1.21 +++ b/net/ipv4/syncookies.c 16 Apr 2004 13:16:22 -0000 1.1.1.21.2.1 @@ -169,18 +169,25 @@ * hasn't changed since we received the original syn, but I see * no easy way to do this. */ - if (ip_route_output(&rt, - opt && - opt->srr ? opt->faddr : req->af.v4_req.rmt_addr, - req->af.v4_req.loc_addr, - RT_CONN_FLAGS(sk), - 0)) { - tcp_openreq_free(req); - goto out; + { + struct flowi fl = { .nl_u = { .ip4_u = + { .daddr = ((opt && opt->srr) ? + opt->faddr : + req->af.v4_req.rmt_addr), + .saddr = req->af.v4_req.loc_addr, + .tos = RT_CONN_FLAGS(sk) } }, + .proto = IPPROTO_TCP, + .uli_u = { .ports = + { .sport = skb->h.th->dest, + .dport = skb->h.th->source } } }; + if (ip_route_output_key(&rt, &fl)) { + tcp_openreq_free(req); + goto out; + } } /* Try to redo what tcp_v4_send_synack did. 
*/ - req->window_clamp = rt->u.dst.window; + req->window_clamp = dst_metric(&rt->u.dst, RTAX_WINDOW); tcp_select_initial_window(tcp_full_space(sk), req->mss, &req->rcv_wnd, &req->window_clamp, 0, &rcv_wscale); Index: net/ipv4/sysctl_net_ipv4.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv4/sysctl_net_ipv4.c,v retrieving revision 1.1.1.20 retrieving revision 1.1.1.20.2.1 diff -u -r1.1.1.20 -r1.1.1.20.2.1 --- a/net/ipv4/sysctl_net_ipv4.c 14 Apr 2004 13:05:41 -0000 1.1.1.20 +++ b/net/ipv4/sysctl_net_ipv4.c 16 Apr 2004 13:16:22 -0000 1.1.1.20.2.1 @@ -82,14 +82,39 @@ void *newval, size_t newlen, void **context) { + int *valp = table->data; int new; + + if (!newval || !newlen) + return 0; + if (newlen != sizeof(int)) return -EINVAL; - if (get_user(new,(int *)newval)) - return -EFAULT; - if (new != ipv4_devconf.forwarding) - inet_forward_change(new); - return 0; /* caller does change again and handles handles oldval */ + + if (get_user(new, (int *)newval)) + return -EFAULT; + + if (new == *valp) + return 0; + + if (oldval && oldlenp) { + size_t len; + + if (get_user(len, oldlenp)) + return -EFAULT; + + if (len) { + if (len > table->maxlen) + len = table->maxlen; + if (copy_to_user(oldval, valp, len)) + return -EFAULT; + if (put_user(len, oldlenp)) + return -EFAULT; + } + } + + inet_forward_change(new); + return 1; } ctl_table ipv4_table[] = { @@ -110,7 +135,7 @@ &ipv4_sysctl_forward,&ipv4_sysctl_forward_strategy}, {NET_IPV4_DEFAULT_TTL, "ip_default_ttl", &sysctl_ip_default_ttl, sizeof(int), 0644, NULL, - &proc_dointvec}, + &ipv4_doint_and_flush, &ipv4_doint_and_flush_strategy}, {NET_IPV4_AUTOCONFIG, "ip_autoconfig", &ipv4_config.autoconfig, sizeof(int), 0644, NULL, &proc_dointvec}, Index: net/ipv4/tcp.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv4/tcp.c,v retrieving revision 
1.1.1.28 retrieving revision 1.1.1.28.2.1 diff -u -r1.1.1.28 -r1.1.1.28.2.1 --- a/net/ipv4/tcp.c 18 Feb 2004 13:36:32 -0000 1.1.1.28 +++ b/net/ipv4/tcp.c 16 Apr 2004 13:16:22 -0000 1.1.1.28.2.1 @@ -204,6 +204,8 @@ * Andi Kleen : Make poll agree with SIGIO * Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and * lingertime == 0 (RFC 793 ABORT Call) + * Hirokazu Takahashi : Use copy_from_user() instead of + * csum_and_copy_from_user() if possible. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -256,6 +258,7 @@ #include #include +#include #include #include @@ -953,8 +956,8 @@ return res; } -#define TCP_PAGE(sk) (sk->tp_pinfo.af_tcp.sndmsg_page) -#define TCP_OFF(sk) (sk->tp_pinfo.af_tcp.sndmsg_off) +#define TCP_PAGE(sk) (inet_sk(sk)->sndmsg_page) +#define TCP_OFF(sk) (inet_sk(sk)->sndmsg_off) static inline int tcp_copy_to_page(struct sock *sk, char *from, struct sk_buff *skb, @@ -963,18 +966,22 @@ int err = 0; unsigned int csum; - csum = csum_and_copy_from_user(from, page_address(page)+off, + if (skb->ip_summed == CHECKSUM_NONE) { + csum = csum_and_copy_from_user(from, page_address(page) + off, copy, 0, &err); - if (!err) { - if (skb->ip_summed == CHECKSUM_NONE) - skb->csum = csum_block_add(skb->csum, csum, skb->len); - skb->len += copy; - skb->data_len += copy; - skb->truesize += copy; - sk->wmem_queued += copy; - sk->forward_alloc -= copy; + if (err) return err; + skb->csum = csum_block_add(skb->csum, csum, skb->len); + } else { + if (copy_from_user(page_address(page) + off, from, copy)) + return -EFAULT; } - return err; + + skb->len += copy; + skb->data_len += copy; + skb->truesize += copy; + sk->wmem_queued += copy; + sk->forward_alloc -= copy; + return 0; } static inline int @@ -984,11 +991,16 @@ unsigned int csum; int off = skb->len; - csum = csum_and_copy_from_user(from, skb_put(skb, copy), + if (skb->ip_summed == CHECKSUM_NONE) { + csum = csum_and_copy_from_user(from, 
skb_put(skb, copy), copy, 0, &err); - if (!err) { - skb->csum = csum_block_add(skb->csum, csum, off); - return 0; + if (!err) { + skb->csum = csum_block_add(skb->csum, csum, off); + return 0; + } + } else { + if (!copy_from_user(skb_put(skb, copy), from, copy)) + return 0; } __skb_trim(skb, off); @@ -1070,6 +1082,12 @@ if (skb == NULL) goto wait_for_memory; + /* + * Check whether we can use HW checksum. + */ + if (sk->route_caps & (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM)) + skb->ip_summed = CHECKSUM_HW; + skb_entail(sk, tp, skb); copy = mss_now; } @@ -1890,6 +1908,8 @@ tcp_kill_sk_queues(sk); + xfrm_sk_free_policy(sk); + #ifdef INET_REFCNT_DEBUG if (atomic_read(&sk->refcnt) != 1) { printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n", sk, atomic_read(&sk->refcnt)); Index: net/ipv4/tcp_input.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv4/tcp_input.c,v retrieving revision 1.1.1.32 retrieving revision 1.1.1.32.2.1 diff -u -r1.1.1.32 -r1.1.1.32.2.1 --- a/net/ipv4/tcp_input.c 14 Apr 2004 13:05:41 -0000 1.1.1.32 +++ b/net/ipv4/tcp_input.c 16 Apr 2004 13:16:22 -0000 1.1.1.32.2.1 @@ -529,25 +529,25 @@ * Probably, no packets returned in time. * Reset our results. */ - if (!(dst->mxlock&(1<rtt = 0; + if (!(dst_metric_locked(dst, RTAX_RTT))) + dst->metrics[RTAX_RTT-1] = 0; return; } - m = dst->rtt - tp->srtt; + m = dst_metric(dst, RTAX_RTT) - tp->srtt; /* If newly calculated rtt larger than stored one, * store new one. Otherwise, use EWMA. Remember, * rtt overestimation is always better than underestimation. 
*/ - if (!(dst->mxlock&(1<rtt = tp->srtt; + dst->metrics[RTAX_RTT-1] = tp->srtt; else - dst->rtt -= (m>>3); + dst->metrics[RTAX_RTT-1] -= (m>>3); } - if (!(dst->mxlock&(1<mdev) m = tp->mdev; - if (m >= dst->rttvar) - dst->rttvar = m; + if (m >= dst_metric(dst, RTAX_RTTVAR)) + dst->metrics[RTAX_RTTVAR-1] = m; else - dst->rttvar -= (dst->rttvar - m)>>2; + dst->metrics[RTAX_RTTVAR-1] -= + (dst->metrics[RTAX_RTTVAR-1] - m)>>2; } if (tp->snd_ssthresh >= 0xFFFF) { /* Slow start still did not finish. */ - if (dst->ssthresh && - !(dst->mxlock&(1<snd_cwnd>>1) > dst->ssthresh) - dst->ssthresh = (tp->snd_cwnd>>1); - if (!(dst->mxlock&(1<snd_cwnd > dst->cwnd) - dst->cwnd = tp->snd_cwnd; + if (dst_metric(dst, RTAX_SSTHRESH) && + !dst_metric_locked(dst, RTAX_SSTHRESH) && + (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH)) + dst->metrics[RTAX_SSTHRESH-1] = tp->snd_cwnd >> 1; + if (!dst_metric_locked(dst, RTAX_CWND) && + tp->snd_cwnd > dst_metric(dst, RTAX_CWND)) + dst->metrics[RTAX_CWND-1] = tp->snd_cwnd; } else if (tp->snd_cwnd > tp->snd_ssthresh && tp->ca_state == TCP_CA_Open) { /* Cong. avoidance phase, cwnd is reliable. */ - if (!(dst->mxlock&(1<ssthresh = max(tp->snd_cwnd>>1, tp->snd_ssthresh); - if (!(dst->mxlock&(1<cwnd = (dst->cwnd + tp->snd_cwnd)>>1; + if (!dst_metric_locked(dst, RTAX_SSTHRESH)) + dst->metrics[RTAX_SSTHRESH-1] = + max(tp->snd_cwnd >> 1, tp->snd_ssthresh); + if (!dst_metric_locked(dst, RTAX_CWND)) + dst->metrics[RTAX_CWND-1] = (dst->metrics[RTAX_CWND-1] + tp->snd_cwnd) >> 1; } else { /* Else slow start did not finish, cwnd is non-sense, ssthresh may be also invalid. 
*/ - if (!(dst->mxlock&(1<cwnd = (dst->cwnd + tp->snd_ssthresh)>>1; - if (dst->ssthresh && - !(dst->mxlock&(1<snd_ssthresh > dst->ssthresh) - dst->ssthresh = tp->snd_ssthresh; + if (!dst_metric_locked(dst, RTAX_CWND)) + dst->metrics[RTAX_CWND-1] = (dst->metrics[RTAX_CWND-1] + tp->snd_ssthresh) >> 1; + if (dst->metrics[RTAX_SSTHRESH-1] && + !dst_metric_locked(dst, RTAX_SSTHRESH) && + tp->snd_ssthresh > dst->metrics[RTAX_SSTHRESH-1]) + dst->metrics[RTAX_SSTHRESH-1] = tp->snd_ssthresh; } - if (!(dst->mxlock&(1<reordering < tp->reordering && + if (!dst_metric_locked(dst, RTAX_REORDERING)) { + if (dst->metrics[RTAX_REORDERING-1] < tp->reordering && tp->reordering != sysctl_tcp_reordering) - dst->reordering = tp->reordering; + dst->metrics[RTAX_REORDERING-1] = tp->reordering; } } } -/* Increase initial CWND conservatively: if estimated - * RTT is low enough (<20msec) or if we have some preset ssthresh. - * - * Numbers are taken from RFC2414. - */ -__u32 tcp_init_cwnd(struct tcp_opt *tp) +/* Numbers are taken from RFC2414. */ +__u32 tcp_init_cwnd(struct tcp_opt *tp, struct dst_entry *dst) { - __u32 cwnd; - - if (tp->mss_cache > 1460) - return 2; - - cwnd = (tp->mss_cache > 1095) ? 3 : 4; - - if (!tp->srtt || (tp->snd_ssthresh >= 0xFFFF && tp->srtt > ((HZ/50)<<3))) - cwnd = 2; - else if (cwnd > tp->snd_ssthresh) - cwnd = tp->snd_ssthresh; + __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); + if (!cwnd) { + if (tp->mss_cache > 1460) + cwnd = 2; + else + cwnd = (tp->mss_cache > 1095) ? 
3 : 4; + } return min_t(__u32, cwnd, tp->snd_cwnd_clamp); } @@ -632,22 +626,23 @@ dst_confirm(dst); - if (dst->mxlock&(1<snd_cwnd_clamp = dst->cwnd; - if (dst->ssthresh) { - tp->snd_ssthresh = dst->ssthresh; + if (dst_metric_locked(dst, RTAX_CWND)) + tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND); + if (dst_metric(dst, RTAX_SSTHRESH)) { + tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH); if (tp->snd_ssthresh > tp->snd_cwnd_clamp) tp->snd_ssthresh = tp->snd_cwnd_clamp; } - if (dst->reordering && tp->reordering != dst->reordering) { + if (dst_metric(dst, RTAX_REORDERING) && + tp->reordering != dst_metric(dst, RTAX_REORDERING)) { tp->sack_ok &= ~2; - tp->reordering = dst->reordering; + tp->reordering = dst_metric(dst, RTAX_REORDERING); } - if (dst->rtt == 0) + if (dst_metric(dst, RTAX_RTT) == 0) goto reset; - if (!tp->srtt && dst->rtt < (TCP_TIMEOUT_INIT<<3)) + if (!tp->srtt && dst_metric(dst, RTAX_RTT) < (TCP_TIMEOUT_INIT << 3)) goto reset; /* Initial rtt is determined from SYN,SYN-ACK. @@ -664,17 +659,17 @@ * to low value, and then abruptly stops to do it and starts to delay * ACKs, wait for troubles. */ - if (dst->rtt > tp->srtt) - tp->srtt = dst->rtt; - if (dst->rttvar > tp->mdev) { - tp->mdev = dst->rttvar; + if (dst_metric(dst, RTAX_RTT) > tp->srtt) + tp->srtt = dst_metric(dst, RTAX_RTT); + if (dst_metric(dst, RTAX_RTTVAR) > tp->mdev) { + tp->mdev = dst_metric(dst, RTAX_RTTVAR); tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN); } tcp_set_rto(tp); tcp_bound_rto(tp); if (tp->rto < TCP_TIMEOUT_INIT && !tp->saw_tstamp) goto reset; - tp->snd_cwnd = tcp_init_cwnd(tp); + tp->snd_cwnd = tcp_init_cwnd(tp, dst); tp->snd_cwnd_stamp = tcp_time_stamp; return; @@ -3923,7 +3918,24 @@ tcp_sync_mss(sk, tp->pmtu_cookie); tcp_initialize_rcv_mss(sk); + + /* Remember, tcp_poll() does not lock socket! + * Change state from SYN-SENT only after copied_seq + * is initialized. 
*/ + tp->copied_seq = tp->rcv_nxt; + mb(); + tcp_set_state(sk, TCP_ESTABLISHED); + + /* Make sure socket is routed, for correct metrics. */ + tp->af_specific->rebuild_header(sk); + tcp_init_metrics(sk); + + /* Prevent spurious tcp_cwnd_restart() on first data + * packet. + */ + tp->lsndtime = tcp_time_stamp; + tcp_init_buffer_space(sk); if (sk->keepopen) @@ -3934,13 +3946,6 @@ else tp->pred_flags = 0; - /* Remember, tcp_poll() does not lock socket! - * Change state from SYN-SENT only after copied_seq - * is initialized. */ - tp->copied_seq = tp->rcv_nxt; - mb(); - tcp_set_state(sk, TCP_ESTABLISHED); - if(!sk->dead) { sk->state_change(sk); sk_wake_async(sk, 0, POLL_OUT); @@ -4186,7 +4191,18 @@ if (tp->tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; + /* Make sure socket is routed, for + * correct metrics. + */ + tp->af_specific->rebuild_header(sk); + tcp_init_metrics(sk); + + /* Prevent spurious tcp_cwnd_restart() on + * first data packet. + */ + tp->lsndtime = tcp_time_stamp; + tcp_initialize_rcv_mss(sk); tcp_init_buffer_space(sk); tcp_fast_path_on(tp); Index: net/ipv4/tcp_ipv4.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv4/tcp_ipv4.c,v retrieving revision 1.1.1.29 retrieving revision 1.1.1.29.2.1 diff -u -r1.1.1.29 -r1.1.1.29.2.1 --- a/net/ipv4/tcp_ipv4.c 14 Apr 2004 13:05:41 -0000 1.1.1.29 +++ b/net/ipv4/tcp_ipv4.c 16 Apr 2004 13:16:22 -0000 1.1.1.29.2.1 @@ -63,13 +63,12 @@ #include #include #include +#include #include #include -#include extern int sysctl_ip_dynaddr; -extern int sysctl_ip_default_ttl; int sysctl_tcp_tw_reuse = 0; int sysctl_tcp_low_latency = 0; @@ -785,7 +784,9 @@ } tmp = ip_route_connect(&rt, nexthop, sk->saddr, - RT_CONN_FLAGS(sk), sk->bound_dev_if); + RT_CONN_FLAGS(sk), sk->bound_dev_if, + IPPROTO_TCP, + sk->sport, usin->sin_port, sk); if (tmp < 0) return tmp; @@ -794,9 +795,6 @@ return -ENETUNREACH; } - __sk_dst_set(sk, &rt->u.dst); - 
sk->route_caps = rt->u.dst.dev->features; - if (!sk->protinfo.af_inet.opt || !sk->protinfo.af_inet.opt->srr) daddr = rt->rt_dst; @@ -846,6 +844,15 @@ if (err) goto failure; + err = ip_route_newports(&rt, sk->sport, sk->dport, sk); + if (err) + goto failure; + + /* OK, now commit destination to socket. */ + __sk_dst_set(sk, &rt->u.dst); + sk->route_caps = rt->u.dst.dev->features; + tp->ext2_header_len = rt->u.dst.header_len; + if (!tp->write_seq) tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr, sk->sport, usin->sin_port); @@ -853,14 +860,16 @@ sk->protinfo.af_inet.id = tp->write_seq^jiffies; err = tcp_connect(sk); + rt = NULL; if (err) goto failure; return 0; failure: + /* This unhashes the socket and releases the local port, if necessary. */ tcp_set_state(sk, TCP_CLOSE); - __sk_dst_reset(sk); + ip_rt_put(rt); sk->route_caps = 0; sk->dport = 0; return err; @@ -922,7 +931,7 @@ /* * This routine does path mtu discovery as defined in RFC1191. */ -static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip, unsigned mtu) +static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip, u32 mtu) { struct dst_entry *dst; struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; @@ -943,17 +952,19 @@ if ((dst = __sk_dst_check(sk, 0)) == NULL) return; - ip_rt_update_pmtu(dst, mtu); + dst->ops->update_pmtu(dst, mtu); /* Something is about to be wrong... Remember soft error * for the case, if this connection will not able to recover. 
*/ - if (mtu < dst->pmtu && ip_dont_fragment(sk, dst)) + if (mtu < dst_pmtu(dst) && ip_dont_fragment(sk, dst)) sk->err_soft = EMSGSIZE; + mtu = dst_pmtu(dst); + if (sk->protinfo.af_inet.pmtudisc != IP_PMTUDISC_DONT && - tp->pmtu_cookie > dst->pmtu) { - tcp_sync_mss(sk, dst->pmtu); + tp->pmtu_cookie > mtu) { + tcp_sync_mss(sk, mtu); /* Resend the TCP packet because it's * clear that the old packet has been @@ -1191,10 +1202,8 @@ sizeof(struct tcphdr), IPPROTO_TCP, 0); - arg.n_iov = 1; arg.csumoffset = offsetof(struct tcphdr, check) / 2; - tcp_socket->sk->protinfo.af_inet.ttl = sysctl_ip_default_ttl; ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth); TCP_INC_STATS_BH(TcpOutSegs); @@ -1219,7 +1228,6 @@ arg.iov[0].iov_base = (unsigned char *)&rep; arg.iov[0].iov_len = sizeof(rep.th); - arg.n_iov = 1; if (ts) { rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | @@ -1270,14 +1278,20 @@ static struct dst_entry* tcp_v4_route_req(struct sock *sk, struct open_request *req) { struct rtable *rt; - struct ip_options *opt; + struct ip_options *opt = req->af.v4_req.opt; + struct flowi fl = { .oif = sk->bound_dev_if, + .nl_u = { .ip4_u = + { .daddr = ((opt && opt->srr) ? + opt->faddr : + req->af.v4_req.rmt_addr), + .saddr = req->af.v4_req.loc_addr, + .tos = RT_CONN_FLAGS(sk) } }, + .proto = IPPROTO_TCP, + .uli_u = { .ports = + { .sport = sk->sport, + .dport = req->rmt_port } } }; - opt = req->af.v4_req.opt; - if(ip_route_output(&rt, ((opt && opt->srr) ? - opt->faddr : - req->af.v4_req.rmt_addr), - req->af.v4_req.loc_addr, - RT_CONN_FLAGS(sk), sk->bound_dev_if)) { + if (ip_route_output_flow(&rt, &fl, sk, 0)) { IP_INC_STATS_BH(IpOutNoRoutes); return NULL; } @@ -1500,7 +1514,7 @@ (sysctl_max_syn_backlog - tcp_synq_len(sk) < (sysctl_max_syn_backlog>>2)) && (!peer || !peer->tcp_ts_stamp) && - (!dst || !dst->rtt)) { + (!dst || !dst_metric(dst, RTAX_RTT))) { /* Without syncookies last quarter of * backlog is filled with destinations, proven to be alive. 
* It means that we continue to communicate @@ -1572,10 +1586,11 @@ newtp->ext_header_len = 0; if (newsk->protinfo.af_inet.opt) newtp->ext_header_len = newsk->protinfo.af_inet.opt->optlen; + newtp->ext2_header_len = dst->header_len; newsk->protinfo.af_inet.id = newtp->write_seq^jiffies; - tcp_sync_mss(newsk, dst->pmtu); - newtp->advmss = dst->advmss; + tcp_sync_mss(newsk, dst_pmtu(dst)); + newtp->advmss = dst_metric(dst, RTAX_ADVMSS);; tcp_initialize_rcv_mss(newsk); __tcp_v4_hash(newsk, 0); @@ -1760,12 +1775,12 @@ goto no_tcp_socket; process: - if(!ipsec_sk_policy(sk,skb)) - goto discard_and_relse; - if (sk->state == TCP_TIME_WAIT) goto do_time_wait; + if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) + goto discard_and_relse; + if (sk_filter(sk, skb, 0)) goto discard_and_relse; @@ -1785,6 +1800,9 @@ return ret; no_tcp_socket: + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto discard_it; + if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) { bad_packet: TCP_INC_STATS_BH(TcpInErrs); @@ -1802,6 +1820,9 @@ goto discard_it; do_time_wait: + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto discard_and_relse; + if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) { TCP_INC_STATS_BH(TcpInErrs); tcp_tw_put((struct tcp_tw_bucket *) sk); @@ -1856,12 +1877,15 @@ /* Query new route. 
*/ err = ip_route_connect(&rt, daddr, 0, RT_TOS(sk->protinfo.af_inet.tos)|sk->localroute, - sk->bound_dev_if); + sk->bound_dev_if, + IPPROTO_TCP, + sk->sport, sk->dport, sk); if (err) return err; __sk_dst_set(sk, &rt->u.dst); sk->route_caps = rt->u.dst.dev->features; + tcp_sk(sk)->ext2_header_len = rt->u.dst.header_len; new_saddr = rt->rt_src; @@ -1904,11 +1928,23 @@ if(sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr) daddr = sk->protinfo.af_inet.opt->faddr; - err = ip_route_output(&rt, daddr, sk->saddr, - RT_CONN_FLAGS(sk), sk->bound_dev_if); + { + struct flowi fl = { .oif = sk->bound_dev_if, + .nl_u = { .ip4_u = + { .daddr = daddr, + .saddr = sk->saddr, + .tos = RT_CONN_FLAGS(sk) } }, + .proto = IPPROTO_TCP, + .uli_u = { .ports = + { .sport = sk->sport, + .dport = sk->dport } } }; + + err = ip_route_output_flow(&rt, &fl, sk, 0); + } if (!err) { __sk_dst_set(sk, &rt->u.dst); sk->route_caps = rt->u.dst.dev->features; + tcp_sk(sk)->ext2_header_len = rt->u.dst.header_len; return 0; } @@ -2070,8 +2106,8 @@ tcp_put_port(sk); /* If sendmsg cached page exists, toss it. 
*/ - if (tp->sndmsg_page != NULL) - __free_page(tp->sndmsg_page); + if (inet_sk(sk)->sndmsg_page) + __free_page(inet_sk(sk)->sndmsg_page); atomic_dec(&tcp_sockets_allocated); @@ -2329,7 +2365,7 @@ if ((err=ops->create(tcp_socket, IPPROTO_TCP))<0) panic("Failed to create the TCP control socket.\n"); tcp_socket->sk->allocation=GFP_ATOMIC; - tcp_socket->sk->protinfo.af_inet.ttl = MAXTTL; + tcp_socket->sk->protinfo.af_inet.uc_ttl = -1; /* Unhash it so that IP input processing does not even * see it, we do not wish this socket to see incoming Index: net/ipv4/tcp_minisocks.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv4/tcp_minisocks.c,v retrieving revision 1.1.1.22 retrieving revision 1.1.1.22.2.1 diff -u -r1.1.1.22 -r1.1.1.22.2.1 --- a/net/ipv4/tcp_minisocks.c 25 Aug 2003 11:44:44 -0000 1.1.1.22 +++ b/net/ipv4/tcp_minisocks.c 16 Apr 2004 13:16:23 -0000 1.1.1.22.2.1 @@ -25,6 +25,7 @@ #include #include #include +#include #ifdef CONFIG_SYSCTL #define SYNC_INIT 0 /* let the user enable it */ @@ -683,6 +684,13 @@ if ((filter = newsk->filter) != NULL) sk_filter_charge(newsk, filter); #endif + if (unlikely(xfrm_sk_clone_policy(newsk))) { + /* It is still raw copy of parent, so invalidate + * destructor and make plain sk_free() */ + newsk->destruct = NULL; + sk_free(newsk); + return NULL; + } /* Now setup tcp_opt */ newtp = &(newsk->tp_pinfo.af_tcp); Index: net/ipv4/tcp_output.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv4/tcp_output.c,v retrieving revision 1.1.1.27 retrieving revision 1.1.1.27.2.1 diff -u -r1.1.1.27 -r1.1.1.27.2.1 --- a/net/ipv4/tcp_output.c 28 Nov 2003 18:26:21 -0000 1.1.1.27 +++ b/net/ipv4/tcp_output.c 16 Apr 2004 13:16:23 -0000 1.1.1.27.2.1 @@ -89,8 +89,8 @@ struct dst_entry *dst = __sk_dst_get(sk); int mss = tp->advmss; - if (dst && dst->advmss < mss) { - mss = 
dst->advmss; + if (dst && dst_metric(dst, RTAX_ADVMSS) < mss) { + mss = dst_metric(dst, RTAX_ADVMSS); tp->advmss = mss; } @@ -99,10 +99,10 @@ /* RFC2861. Reset CWND after idle period longer RTO to "restart window". * This is the first part of cwnd validation mechanism. */ -static void tcp_cwnd_restart(struct tcp_opt *tp) +static void tcp_cwnd_restart(struct tcp_opt *tp, struct dst_entry *dst) { s32 delta = tcp_time_stamp - tp->lsndtime; - u32 restart_cwnd = tcp_init_cwnd(tp); + u32 restart_cwnd = tcp_init_cwnd(tp, dst); u32 cwnd = tp->snd_cwnd; tp->snd_ssthresh = tcp_current_ssthresh(tp); @@ -115,12 +115,12 @@ tp->snd_cwnd_used = 0; } -static __inline__ void tcp_event_data_sent(struct tcp_opt *tp, struct sk_buff *skb) +static __inline__ void tcp_event_data_sent(struct tcp_opt *tp, struct sk_buff *skb, struct sock *sk) { u32 now = tcp_time_stamp; if (!tp->packets_out && (s32)(now - tp->lsndtime) > tp->rto) - tcp_cwnd_restart(tp); + tcp_cwnd_restart(tp, __sk_dst_get(sk)); tp->lsndtime = now; @@ -271,7 +271,7 @@ tcp_event_ack_sent(sk); if (skb->len != tcp_header_size) - tcp_event_data_sent(tp, skb); + tcp_event_data_sent(tp, skb, sk); TCP_INC_STATS(TcpOutSegs); @@ -502,13 +502,16 @@ int tcp_sync_mss(struct sock *sk, u32 pmtu) { - struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + struct tcp_opt *tp = tcp_sk(sk); + struct dst_entry *dst = __sk_dst_get(sk); int mss_now; + if (dst && dst->ops->get_mss) + pmtu = dst->ops->get_mss(dst, pmtu); + /* Calculate base mss without TCP options: It is MMS_S - sizeof(tcphdr) of rfc1122 */ - mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr); /* Clamp it (mss_clamp does not include tcp options) */ @@ -516,7 +519,7 @@ mss_now = tp->mss_clamp; /* Now subtract optional transport overhead */ - mss_now -= tp->ext_header_len; + mss_now -= tp->ext_header_len + tp->ext2_header_len; /* Then reserve room for full set of TCP options and 8 bytes of data */ if (mss_now < 48) @@ -1131,10 +1134,10 @@ if (req->rcv_wnd == 0) { /* 
ignored for retransmitted syns */ __u8 rcv_wscale; /* Set this up on the first call only */ - req->window_clamp = tp->window_clamp ? : dst->window; + req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); /* tcp_full_space because it is guaranteed to be the first packet */ tcp_select_initial_window(tcp_full_space(sk), - dst->advmss - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), + dst_metric(dst, RTAX_ADVMSS) - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), &req->rcv_wnd, &req->window_clamp, req->wscale_ok, @@ -1146,7 +1149,7 @@ th->window = htons(req->rcv_wnd); TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_syn_build_options((__u32 *)(th + 1), dst->advmss, req->tstamp_ok, + tcp_syn_build_options((__u32 *)(th + 1), dst_metric(dst, RTAX_ADVMSS), req->tstamp_ok, req->sack_ok, req->wscale_ok, req->rcv_wscale, TCP_SKB_CB(skb)->when, req->ts_recent); @@ -1175,11 +1178,11 @@ if (tp->user_mss) tp->mss_clamp = tp->user_mss; tp->max_window = 0; - tcp_sync_mss(sk, dst->pmtu); + tcp_sync_mss(sk, dst_pmtu(dst)); if (!tp->window_clamp) - tp->window_clamp = dst->window; - tp->advmss = dst->advmss; + tp->window_clamp = dst_metric(dst, RTAX_WINDOW); + tp->advmss = dst_metric(dst, RTAX_ADVMSS); tcp_initialize_rcv_mss(sk); tcp_select_initial_window(tcp_full_space(sk), Index: net/ipv4/udp.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv4/udp.c,v retrieving revision 1.1.1.25 retrieving revision 1.1.1.25.2.1 diff -u -r1.1.1.25 -r1.1.1.25.2.1 --- a/net/ipv4/udp.c 14 Apr 2004 13:05:41 -0000 1.1.1.25 +++ b/net/ipv4/udp.c 16 Apr 2004 13:16:23 -0000 1.1.1.25.2.1 @@ -11,6 +11,7 @@ * Fred N. 
van Kempen, * Arnt Gulbrandsen, * Alan Cox, + * Hirokazu Takahashi, * * Fixes: * Alan Cox : verify_area() calls @@ -64,6 +65,10 @@ * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which * Alexey Kuznetsov: allow both IPv4 and IPv6 sockets to bind * a single port at the same time. + * Hirokazu Takahashi : HW checksumming for outgoing UDP + * datagrams. + * Hirokazu Takahashi : sendfile() on UDP works now. + * Derek Atkins : Add Encapulation Support * * * This program is free software; you can redistribute it and/or @@ -97,6 +102,7 @@ #include #include #include +#include /* * Snmp MIB for the UDP layer @@ -371,80 +377,119 @@ sock_put(sk); } - -static unsigned short udp_check(struct udphdr *uh, int len, unsigned long saddr, unsigned long daddr, unsigned long base) -{ - return(csum_tcpudp_magic(saddr, daddr, len, IPPROTO_UDP, base)); -} - -struct udpfakehdr -{ - struct udphdr uh; - u32 saddr; - u32 daddr; - struct iovec *iov; - u32 wcheck; -}; - /* - * Copy and checksum a UDP packet from user space into a buffer. + * Throw away all pending data and cancel the corking. Socket is locked. 
*/ - -static int udp_getfrag(const void *p, char * to, unsigned int offset, unsigned int fraglen) +static void udp_flush_pending_frames(struct sock *sk) { - struct udpfakehdr *ufh = (struct udpfakehdr *)p; - if (offset==0) { - if (csum_partial_copy_fromiovecend(to+sizeof(struct udphdr), ufh->iov, offset, - fraglen-sizeof(struct udphdr), &ufh->wcheck)) - return -EFAULT; - ufh->wcheck = csum_partial((char *)ufh, sizeof(struct udphdr), - ufh->wcheck); - ufh->uh.check = csum_tcpudp_magic(ufh->saddr, ufh->daddr, - ntohs(ufh->uh.len), - IPPROTO_UDP, ufh->wcheck); - if (ufh->uh.check == 0) - ufh->uh.check = -1; - memcpy(to, ufh, sizeof(struct udphdr)); - return 0; + struct udp_opt *up = udp_sk(sk); + + if (up->pending) { + up->len = 0; + up->pending = 0; + ip_flush_pending_frames(sk); } - if (csum_partial_copy_fromiovecend(to, ufh->iov, offset-sizeof(struct udphdr), - fraglen, &ufh->wcheck)) - return -EFAULT; - return 0; } /* - * Copy a UDP packet from user space into a buffer without checksumming. + * Push out all pending data as one UDP datagram. Socket is locked. */ - -static int udp_getfrag_nosum(const void *p, char * to, unsigned int offset, unsigned int fraglen) +static int udp_push_pending_frames(struct sock *sk, struct udp_opt *up) { - struct udpfakehdr *ufh = (struct udpfakehdr *)p; + struct sk_buff *skb; + struct udphdr *uh; + int err = 0; + + /* Grab the skbuff where UDP header space exists. 
*/ + if ((skb = skb_peek(&sk->write_queue)) == NULL) + goto out; + + /* + * Create a UDP header + */ + uh = skb->h.uh; + uh->source = up->sport; + uh->dest = up->dport; + uh->len = htons(up->len); + uh->check = 0; - if (offset==0) { - memcpy(to, ufh, sizeof(struct udphdr)); - return memcpy_fromiovecend(to+sizeof(struct udphdr), ufh->iov, offset, - fraglen-sizeof(struct udphdr)); + if (sk->no_check == UDP_CSUM_NOXMIT) { + skb->ip_summed = CHECKSUM_NONE; + goto send; } - return memcpy_fromiovecend(to, ufh->iov, offset-sizeof(struct udphdr), - fraglen); + + if (skb_queue_len(&sk->write_queue) == 1) { + /* + * Only one fragment on the socket. + */ + if (skb->ip_summed == CHECKSUM_HW) { + skb->csum = offsetof(struct udphdr, check); + uh->check = ~csum_tcpudp_magic(up->saddr, up->daddr, + up->len, IPPROTO_UDP, 0); + } else { + skb->csum = csum_partial((char *)uh, + sizeof(struct udphdr), skb->csum); + uh->check = csum_tcpudp_magic(up->saddr, up->daddr, + up->len, IPPROTO_UDP, skb->csum); + if (uh->check == 0) + uh->check = -1; + } + } else { + unsigned int csum = 0; + /* + * HW-checksum won't work as there are two or more + * fragments on the socket so that all csums of sk_buffs + * should be together. 
+ */ + if (skb->ip_summed == CHECKSUM_HW) { + int offset = (unsigned char *)uh - skb->data; + skb->csum = skb_checksum(skb, offset, skb->len - offset, 0); + + skb->ip_summed = CHECKSUM_NONE; + } else { + skb->csum = csum_partial((char *)uh, + sizeof(struct udphdr), skb->csum); + } + + skb_queue_walk(&sk->write_queue, skb) { + csum = csum_add(csum, skb->csum); + } + uh->check = csum_tcpudp_magic(up->saddr, up->daddr, + up->len, IPPROTO_UDP, csum); + if (uh->check == 0) + uh->check = -1; + } +send: + err = ip_push_pending_frames(sk); +out: + up->len = 0; + up->pending = 0; + return err; +} + + +static unsigned short udp_check(struct udphdr *uh, int len, unsigned long saddr, unsigned long daddr, unsigned long base) +{ + return(csum_tcpudp_magic(saddr, daddr, len, IPPROTO_UDP, base)); } int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len) { - int ulen = len + sizeof(struct udphdr); + struct udp_opt *up = udp_sk(sk); + int ulen = len; struct ipcm_cookie ipc; - struct udpfakehdr ufh; struct rtable *rt = NULL; int free = 0; int connected = 0; - u32 daddr; + u32 daddr, faddr, saddr; + u16 dport; u8 tos; int err; + int corkreq = up->corkflag || msg->msg_flags&MSG_MORE; /* This check is ONLY to check for arithmetic overflow on integer(!) len. Not more! Real check will be made - in ip_build_xmit --ANK + in ip_append_* --ANK BTW socket.c -> af_*.c -> ... make multiple invalid conversions size_t -> int. We MUST repair it f.e. @@ -463,10 +508,23 @@ if (msg->msg_flags&MSG_OOB) /* Mirror BSD error message compatibility */ return -EOPNOTSUPP; + ipc.opt = NULL; + + if (up->pending) { + /* + * There are pending frames. + * The socket lock must be held while it's corked. + */ + lock_sock(sk); + if (likely(up->pending)) + goto do_append_data; + release_sock(sk); + } + ulen += sizeof(struct udphdr); + /* * Get and verify the address. 
*/ - if (msg->msg_name) { struct sockaddr_in * usin = (struct sockaddr_in*)msg->msg_name; if (msg->msg_namelen < sizeof(*usin)) @@ -476,24 +534,22 @@ return -EINVAL; } - ufh.daddr = usin->sin_addr.s_addr; - ufh.uh.dest = usin->sin_port; - if (ufh.uh.dest == 0) + daddr = usin->sin_addr.s_addr; + dport = usin->sin_port; + if (dport == 0) return -EINVAL; } else { if (sk->state != TCP_ESTABLISHED) return -EDESTADDRREQ; - ufh.daddr = sk->daddr; - ufh.uh.dest = sk->dport; + daddr = sk->daddr; + dport = sk->dport; /* Open fast path for connected socket. Route will not be used, if at least one option is set. */ connected = 1; } ipc.addr = sk->saddr; - ufh.uh.source = sk->sport; - ipc.opt = NULL; ipc.oif = sk->bound_dev_if; if (msg->msg_controllen) { err = ip_cmsg_send(msg, &ipc); @@ -506,13 +562,13 @@ if (!ipc.opt) ipc.opt = sk->protinfo.af_inet.opt; - ufh.saddr = ipc.addr; - ipc.addr = daddr = ufh.daddr; + saddr = ipc.addr; + ipc.addr = faddr = daddr; if (ipc.opt && ipc.opt->srr) { if (!daddr) return -EINVAL; - daddr = ipc.opt->faddr; + faddr = ipc.opt->faddr; connected = 0; } tos = RT_TOS(sk->protinfo.af_inet.tos); @@ -525,8 +581,8 @@ if (MULTICAST(daddr)) { if (!ipc.oif) ipc.oif = sk->protinfo.af_inet.mc_index; - if (!ufh.saddr) - ufh.saddr = sk->protinfo.af_inet.mc_addr; + if (!saddr) + saddr = sk->protinfo.af_inet.mc_addr; connected = 0; } @@ -534,7 +590,16 @@ rt = (struct rtable*)sk_dst_check(sk, 0); if (rt == NULL) { - err = ip_route_output(&rt, daddr, ufh.saddr, tos, ipc.oif); + struct flowi fl = { .oif = ipc.oif, + .nl_u = { .ip4_u = + { .daddr = faddr, + .saddr = saddr, + .tos = tos } }, + .proto = IPPROTO_UDP, + .uli_u = { .ports = + { .sport = sk->sport, + .dport = dport } } }; + err = ip_route_output_flow(&rt, &fl, sk, !(msg->msg_flags&MSG_DONTWAIT)); if (err) goto out; @@ -549,23 +614,39 @@ goto do_confirm; back_from_confirm: - ufh.saddr = rt->rt_src; + saddr = rt->rt_src; if (!ipc.addr) - ufh.daddr = ipc.addr = rt->rt_dst; - ufh.uh.len = htons(ulen); - 
ufh.uh.check = 0; - ufh.iov = msg->msg_iov; - ufh.wcheck = 0; - - /* RFC1122: OK. Provides the checksumming facility (MUST) as per */ - /* 4.1.3.4. It's configurable by the application via setsockopt() */ - /* (MAY) and it defaults to on (MUST). */ - - err = ip_build_xmit(sk, - (sk->no_check == UDP_CSUM_NOXMIT ? - udp_getfrag_nosum : - udp_getfrag), - &ufh, ulen, &ipc, rt, msg->msg_flags); + daddr = ipc.addr = rt->rt_dst; + + lock_sock(sk); + if (unlikely(up->pending)) { + /* The socket is already corked while preparing it. */ + /* ... which is an evident application bug. --ANK */ + release_sock(sk); + + NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp cork app bug 2\n")); + err = -EINVAL; + goto out; + } + /* + * Now cork the socket to pend data. + */ + up->daddr = daddr; + up->dport = dport; + up->saddr = saddr; + up->sport = sk->sport; + up->pending = 1; + +do_append_data: + up->len += ulen; + err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, ulen, + sizeof(struct udphdr), &ipc, rt, + corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); + if (err) + udp_flush_pending_frames(sk); + else if (!corkreq) + err = udp_push_pending_frames(sk, up); + release_sock(sk); out: ip_rt_put(rt); @@ -585,6 +666,52 @@ goto out; } +int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags) +{ + struct udp_opt *up = udp_sk(sk); + int ret; + + if (!up->pending) { + struct msghdr msg = { .msg_flags = flags|MSG_MORE }; + + /* Call udp_sendmsg to specify destination address which + * sendpage interface can't pass. + * This will succeed only when the socket is connected. 
+ */ + ret = udp_sendmsg(sk, &msg, 0); + if (ret < 0) + return ret; + } + + lock_sock(sk); + + if (unlikely(!up->pending)) { + release_sock(sk); + + NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp cork app bug 3\n")); + return -EINVAL; + } + + ret = ip_append_page(sk, page, offset, size, flags); + if (ret == -EOPNOTSUPP) { + release_sock(sk); + return sock_no_sendpage(sk->socket, page, offset, size, flags); + } + if (ret < 0) { + udp_flush_pending_frames(sk); + goto out; + } + + up->len += size; + if (!(up->corkflag || (flags&MSG_MORE))) + ret = udp_push_pending_frames(sk, up); + if (!ret) + ret = size; +out: + release_sock(sk); + return ret; +} + /* * IOCTL requests applicable to the UDP protocol */ @@ -754,7 +881,9 @@ saddr = sk->protinfo.af_inet.mc_addr; } err = ip_route_connect(&rt, usin->sin_addr.s_addr, saddr, - RT_CONN_FLAGS(sk), oif); + RT_CONN_FLAGS(sk), oif, + IPPROTO_UDP, + sk->sport, usin->sin_port, sk); if (err) return err; if ((rt->rt_flags&RTCF_BROADCAST) && !sk->broadcast) { @@ -805,11 +934,129 @@ inet_sock_release(sk); } +/* return: + * 1 if the UDP system should process it + * 0 if we should drop this packet + * -1 if it should get processed by xfrm4_rcv_encap + */ +static int udp_encap_rcv(struct sock * sk, struct sk_buff *skb) +{ +#ifndef CONFIG_XFRM + return 1; +#else + struct udp_opt *up = udp_sk(sk); + struct udphdr *uh = skb->h.uh; + struct iphdr *iph; + int iphlen, len; + + __u8 *udpdata = (__u8 *)uh + sizeof(struct udphdr); + __u32 *udpdata32 = (__u32 *)udpdata; + __u16 encap_type = up->encap_type; + + /* if we're overly short, let UDP handle it */ + if (udpdata > skb->tail) + return 1; + + /* if this is not encapsulated socket, then just return now */ + if (!encap_type) + return 1; + + len = skb->tail - udpdata; + + switch (encap_type) { + case UDP_ENCAP_ESPINUDP: + /* Check if this is a keepalive packet. If so, eat it.
*/ + if (len == 1 && udpdata[0] == 0xff) { + return 0; + } else if (len > sizeof(struct ip_esp_hdr) && udpdata32[0] != 0 ) { + /* ESP Packet without Non-ESP header */ + len = sizeof(struct udphdr); + } else + /* Must be an IKE packet.. pass it through */ + return 1; + + /* At this point we are sure that this is an ESPinUDP packet, + * so we need to remove 'len' bytes from the packet (the UDP + * header and optional ESP marker bytes) and then modify the + * protocol to ESP, and then call into the transform receiver. + */ + + /* Now we can update and verify the packet length... */ + iph = skb->nh.iph; + iphlen = iph->ihl << 2; + iph->tot_len = htons(ntohs(iph->tot_len) - len); + if (skb->len < iphlen + len) { + /* packet is too small!?! */ + return 0; + } + + /* pull the data buffer up to the ESP header and set the + * transport header to point to ESP. Keep UDP on the stack + * for later. + */ + skb->h.raw = skb_pull(skb, len); + + /* modify the protocol (it's ESP!) */ + iph->protocol = IPPROTO_ESP; + + /* and let the caller know to send this into the ESP processor... */ + return -1; + + default: + if (net_ratelimit()) + printk(KERN_INFO "udp_encap_rcv(): Unhandled UDP encap type: %u\n", + encap_type); + return 1; + } +#endif +} + +/* returns: + * -1: error + * 0: success + * >0: "udp encap" protocol resubmission + * + * Note that in the success and error cases, the skb is assumed to + * have either been requeued or freed. + */ static int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) { + struct udp_opt *up = udp_sk(sk); + /* * Charge it to the socket, dropping if the queue is full. */ + if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { + kfree_skb(skb); + return -1; + } + + if (up->encap_type) { + /* + * This is an encapsulation socket, so let's see if this is + * an encapsulated packet. + * If it's a keepalive packet, then just eat it. 
+ * If it's an encapsulated packet, then pass it to the + * IPsec xfrm input and return the response + * appropriately. Otherwise, just fall through and + * pass this up the UDP socket. + */ + int ret; + + ret = udp_encap_rcv(sk, skb); + if (ret == 0) { + /* Eat the packet .. */ + kfree_skb(skb); + return 0; + } + if (ret < 0) { + /* process the ESP packet */ + ret = xfrm4_rcv_encap(skb, up->encap_type); + UDP_INC_STATS_BH(UdpInDatagrams); + return -ret; + } + /* FALLTHROUGH -- it's a UDP Packet */ + } #if defined(CONFIG_FILTER) if (sk->filter && skb->ip_summed != CHECKSUM_UNNECESSARY) { @@ -862,8 +1109,13 @@ if(sknext) skb1 = skb_clone(skb, GFP_ATOMIC); - if(skb1) - udp_queue_rcv_skb(sk, skb1); + if(skb1) { + int ret = udp_queue_rcv_skb(sk, skb1); + if (ret > 0) + /* we should probably re-process instead + * of dropping packets here. */ + kfree_skb(skb1); + } sk = sknext; } while(sknext); } else @@ -938,11 +1190,20 @@ sk = udp_v4_lookup(saddr, uh->source, daddr, uh->dest, skb->dev->ifindex); if (sk != NULL) { - udp_queue_rcv_skb(sk, skb); + int ret = udp_queue_rcv_skb(sk, skb); sock_put(sk); + + /* a return value > 0 means to resubmit the input, but + * it wants the return to be -protocol, or 0 + */ + if (ret > 0) + return -ret; return 0; } + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto drop; + /* No socket.
Drop packet silently, if checksum is wrong */ if (udp_checksum_complete(skb)) goto csum_error; @@ -983,6 +1244,7 @@ NIPQUAD(daddr), ntohs(uh->dest), ulen)); +drop: UDP_INC_STATS_BH(UdpInErrors); kfree_skb(skb); return(0); @@ -1047,16 +1309,107 @@ return len; } +static int udp_destroy_sock(struct sock *sk) +{ + lock_sock(sk); + udp_flush_pending_frames(sk); + release_sock(sk); + return 0; +} + +/* + * Socket option code for UDP + */ +static int udp_setsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen) +{ + struct udp_opt *up = udp_sk(sk); + int val; + int err = 0; + + if (level != SOL_UDP) + return ip_setsockopt(sk, level, optname, optval, optlen); + + if(optlencorkflag = 1; + } else { + up->corkflag = 0; + lock_sock(sk); + udp_push_pending_frames(sk, up); + release_sock(sk); + } + break; + + case UDP_ENCAP: + up->encap_type = val; + break; + + default: + err = -ENOPROTOOPT; + break; + }; + + return err; +} + +static int udp_getsockopt(struct sock *sk, int level, int optname, + char *optval, int *optlen) +{ + struct udp_opt *up = udp_sk(sk); + int val, len; + + if (level != SOL_UDP) + return ip_getsockopt(sk, level, optname, optval, optlen); + + if(get_user(len,optlen)) + return -EFAULT; + + len = min_t(unsigned int, len, sizeof(int)); + + if(len < 0) + return -EINVAL; + + switch(optname) { + case UDP_CORK: + val = up->corkflag; + break; + + case UDP_ENCAP: + val = up->encap_type; + break; + + default: + return -ENOPROTOOPT; + }; + + if(put_user(len, optlen)) + return -EFAULT; + if(copy_to_user(optval, &val,len)) + return -EFAULT; + return 0; +} + + struct proto udp_prot = { name: "UDP", close: udp_close, connect: udp_connect, disconnect: udp_disconnect, ioctl: udp_ioctl, - setsockopt: ip_setsockopt, - getsockopt: ip_getsockopt, + destroy: udp_destroy_sock, + setsockopt: udp_setsockopt, + getsockopt: udp_getsockopt, sendmsg: udp_sendmsg, recvmsg: udp_recvmsg, + sendpage: udp_sendpage, backlog_rcv: udp_queue_rcv_skb, hash: udp_v4_hash, 
unhash: udp_v4_unhash, Index: net/ipv4/xfrm4_input.c =================================================================== RCS file: net/ipv4/xfrm4_input.c diff -N net/ipv4/xfrm4_input.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ b/net/ipv4/xfrm4_input.c 16 Apr 2004 13:16:23 -0000 1.6.14.1 @@ -0,0 +1,158 @@ +/* + * xfrm4_input.c + * + * Changes: + * YOSHIFUJI Hideaki @USAGI + * Split up af-specific portion + * Derek Atkins + * Add Encapsulation support + * + */ + +#include +#include +#include +#include + +int xfrm4_rcv(struct sk_buff *skb) +{ + return xfrm4_rcv_encap(skb, 0); +} + +static inline void ipip_ecn_decapsulate(struct sk_buff *skb) +{ + struct iphdr *outer_iph = skb->nh.iph; + struct iphdr *inner_iph = skb->h.ipiph; + + if (INET_ECN_is_ce(outer_iph->tos) && + INET_ECN_is_not_ce(inner_iph->tos)) + IP_ECN_set_ce(inner_iph); +} + +static int xfrm4_parse_spi(struct sk_buff *skb, u8 nexthdr, u32 *spi, u32 *seq) +{ + switch (nexthdr) { + case IPPROTO_IPIP: + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + return -EINVAL; + *spi = skb->nh.iph->saddr; + *seq = 0; + return 0; + } + + return xfrm_parse_spi(skb, nexthdr, spi, seq); +} + +int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type) +{ + int err; + u32 spi, seq; + struct sec_decap_state xfrm_vec[XFRM_MAX_DEPTH]; + struct xfrm_state *x; + int xfrm_nr = 0; + int decaps = 0; + int hhlen; + + hhlen = skb->nh.raw - skb->mac.raw; + + if ((err = xfrm4_parse_spi(skb, skb->nh.iph->protocol, &spi, &seq)) != 0) + goto drop; + + do { + struct iphdr *iph = skb->nh.iph; + + if (xfrm_nr == XFRM_MAX_DEPTH) + goto drop_put; + + x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, spi, iph->protocol, AF_INET); + if (x == NULL) + goto drop_put; + + spin_lock(&x->lock); + if (unlikely(x->km.state != XFRM_STATE_VALID)) + goto drop_unlock; + + if (x->props.replay_window && xfrm_replay_check(x, seq)) + goto drop_unlock; + + if (xfrm_state_check_expire(x)) + goto drop_unlock; + + xfrm_vec[xfrm_nr].decap.decap_type = 
encap_type; + if (x->type->input(x, &(xfrm_vec[xfrm_nr].decap), skb)) + goto drop_unlock; + + /* only the first xfrm gets the encap type */ + encap_type = 0; + + if (x->props.replay_window) + xfrm_replay_advance(x, seq); + + x->curlft.bytes += skb->len; + x->curlft.packets++; + + spin_unlock(&x->lock); + + xfrm_vec[xfrm_nr++].xvec = x; + + if (x->props.mode) { + if (skb->nh.iph->protocol != IPPROTO_IPIP) + goto drop_put; + decaps = 1; + break; + } + + if ((err = xfrm_parse_spi(skb, skb->nh.iph->protocol, &spi, &seq)) < 0) + goto drop_put; + } while (!err); + + /* Allocate new secpath or COW existing one. */ + + if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) { + struct sec_path *sp; + sp = secpath_dup(skb->sp); + if (!sp) + goto drop_put; + if (skb->sp) + secpath_put(skb->sp); + skb->sp = sp; + } + if (xfrm_nr + skb->sp->len > XFRM_MAX_DEPTH) + goto drop_put; + + memcpy(skb->sp->x+skb->sp->len, xfrm_vec, xfrm_nr*sizeof(struct sec_decap_state)); + skb->sp->len += xfrm_nr; + + if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + goto drop; + + if (decaps) { + skb->mac.raw = memmove(skb->data - hhlen, skb->mac.raw, hhlen); + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + goto drop; + if (!(x->props.flags & XFRM_STATE_NOECN)) + ipip_ecn_decapsulate(skb); + skb->nh.raw = skb->data; + memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + if (!(skb->dev->flags&IFF_LOOPBACK)) { + dst_release(skb->dst); + skb->dst = NULL; + } + netif_rx(skb); + return 0; + } else { + skb->mac.raw = memmove(skb->nh.raw - hhlen, skb->mac.raw, + hhlen); + return -skb->nh.iph->protocol; + } + +drop_unlock: + spin_unlock(&x->lock); + xfrm_state_put(x); +drop_put: + while (--xfrm_nr >= 0) + xfrm_state_put(xfrm_vec[xfrm_nr].xvec); +drop: + kfree_skb(skb); + return 0; +} Index: net/ipv4/xfrm4_policy.c =================================================================== RCS file: net/ipv4/xfrm4_policy.c diff -N net/ipv4/xfrm4_policy.c --- /dev/null 1 Jan 1970 00:00:00 -0000 
+++ b/net/ipv4/xfrm4_policy.c 16 Apr 2004 13:16:23 -0000 1.4.18.1 @@ -0,0 +1,278 @@ +/* + * xfrm4_policy.c + * + * Changes: + * Kazunori MIYAZAWA @USAGI + * YOSHIFUJI Hideaki @USAGI + * Split up af-specific portion + * + */ + +#include +#include +#include + +extern struct dst_ops xfrm4_dst_ops; +extern struct xfrm_policy_afinfo xfrm4_policy_afinfo; + +static struct xfrm_type_map xfrm4_type_map = { .lock = RW_LOCK_UNLOCKED }; + +static int xfrm4_dst_lookup(struct xfrm_dst **dst, struct flowi *fl) +{ + return __ip_route_output_key((struct rtable**)dst, fl); +} + +/* Check that the bundle accepts the flow and its components are + * still valid. + */ + +static int __xfrm4_bundle_ok(struct xfrm_dst *xdst, struct flowi *fl) +{ + do { + if (xdst->u.dst.ops != &xfrm4_dst_ops) + return 1; + + if (!xfrm_selector_match(&xdst->u.dst.xfrm->sel, fl, AF_INET)) + return 0; + if (xdst->u.dst.xfrm->km.state != XFRM_STATE_VALID || + xdst->u.dst.path->obsolete > 0) + return 0; + xdst = (struct xfrm_dst*)xdst->u.dst.child; + } while (xdst); + return 0; +} + +static struct dst_entry * +__xfrm4_find_bundle(struct flowi *fl, struct xfrm_policy *policy) +{ + struct dst_entry *dst; + + read_lock_bh(&policy->lock); + for (dst = policy->bundles; dst; dst = dst->next) { + struct xfrm_dst *xdst = (struct xfrm_dst*)dst; + if (xdst->u.rt.fl.oif == fl->oif && /*XXX*/ + xdst->u.rt.fl.fl4_dst == fl->fl4_dst && + xdst->u.rt.fl.fl4_src == fl->fl4_src && + __xfrm4_bundle_ok(xdst, fl)) { + dst_clone(dst); + break; + } + } + read_unlock_bh(&policy->lock); + return dst; +} + +/* Allocate chain of dst_entry's, attach known xfrm's, calculate + * all the metrics... Shortly, bundle a bundle. 
+ */ + +static int +__xfrm4_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int nx, + struct flowi *fl, struct dst_entry **dst_p) +{ + struct dst_entry *dst, *dst_prev; + struct rtable *rt0 = (struct rtable*)(*dst_p); + struct rtable *rt = rt0; + u32 remote = fl->fl4_dst; + u32 local = fl->fl4_src; + int i; + int err; + int header_len = 0; + int trailer_len = 0; + + dst = dst_prev = NULL; + + for (i = 0; i < nx; i++) { + struct dst_entry *dst1 = dst_alloc(&xfrm4_dst_ops); + + if (unlikely(dst1 == NULL)) { + err = -ENOBUFS; + goto error; + } + + dst1->xfrm = xfrm[i]; + if (!dst) + dst = dst1; + else { + dst_prev->child = dst1; + dst1->flags |= DST_NOHASH; + dst_clone(dst1); + } + dst_prev = dst1; + if (xfrm[i]->props.mode) { + remote = xfrm[i]->id.daddr.a4; + local = xfrm[i]->props.saddr.a4; + } + header_len += xfrm[i]->props.header_len; + trailer_len += xfrm[i]->props.trailer_len; + } + + if (remote != fl->fl4_dst) { + struct flowi fl_tunnel = { .nl_u = { .ip4_u = + { .daddr = remote, + .saddr = local } + } + }; + err = xfrm_dst_lookup((struct xfrm_dst**)&rt, &fl_tunnel, AF_INET); + if (err) + goto error; + } else { + dst_hold(&rt->u.dst); + } + dst_prev->child = &rt->u.dst; + for (dst_prev = dst; dst_prev != &rt->u.dst; dst_prev = dst_prev->child) { + struct xfrm_dst *x = (struct xfrm_dst*)dst_prev; + x->u.rt.fl = *fl; + + dst_prev->dev = rt->u.dst.dev; + if (rt->u.dst.dev) + dev_hold(rt->u.dst.dev); + dst_prev->obsolete = -1; + dst_prev->flags |= DST_HOST; + dst_prev->lastuse = jiffies; + dst_prev->header_len = header_len; + dst_prev->trailer_len = trailer_len; + memcpy(&dst_prev->metrics, &rt->u.dst.metrics, sizeof(dst_prev->metrics)); + dst_prev->path = &rt->u.dst; + + /* Copy neighbout for reachability confirmation */ + dst_prev->neighbour = neigh_clone(rt->u.dst.neighbour); + dst_prev->input = rt->u.dst.input; + dst_prev->output = dst_prev->xfrm->type->output; + if (rt->peer) + atomic_inc(&rt->peer->refcnt); + x->u.rt.peer = rt->peer; + /* 
Sheit... I remember I did this right. Apparently, + * it was magically lost, so this code needs audit */ + x->u.rt.rt_flags = rt0->rt_flags&(RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL); + x->u.rt.rt_type = rt->rt_type; + x->u.rt.rt_src = rt0->rt_src; + x->u.rt.rt_dst = rt0->rt_dst; + x->u.rt.rt_gateway = rt->rt_gateway; + x->u.rt.rt_spec_dst = rt0->rt_spec_dst; + header_len -= x->u.dst.xfrm->props.header_len; + trailer_len -= x->u.dst.xfrm->props.trailer_len; + } + *dst_p = dst; + return 0; + +error: + if (dst) + dst_free(dst); + return err; +} + +static void +_decode_session4(struct sk_buff *skb, struct flowi *fl) +{ /* Fill *fl from the IPv4 packet in skb: protocol, addresses and, for non-fragments, ports or IPsec SPI. */ + struct iphdr *iph = skb->nh.iph; + u8 *xprth = skb->nh.raw + iph->ihl*4; + + memset(fl, 0, sizeof(struct flowi)); + if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) { /* fragments are skipped: ports/SPI stay zero */ + switch (iph->protocol) { + case IPPROTO_UDP: + case IPPROTO_TCP: + case IPPROTO_SCTP: + if (pskb_may_pull(skb, xprth + 4 - skb->data)) { /* the pull may reallocate the header: iph and xprth are stale from here on */ + u16 *ports = (u16 *)(skb->nh.raw + skb->nh.iph->ihl*4); + + fl->fl_ip_sport = ports[0]; + fl->fl_ip_dport = ports[1]; + } + break; + + case IPPROTO_ESP: + if (pskb_may_pull(skb, xprth + 4 - skb->data)) { + u32 *ehdr = (u32 *)(skb->nh.raw + skb->nh.iph->ihl*4); + + fl->fl_ipsec_spi = ehdr[0]; + } + break; + + case IPPROTO_AH: + if (pskb_may_pull(skb, xprth + 8 - skb->data)) { + u32 *ah_hdr = (u32*)(skb->nh.raw + skb->nh.iph->ihl*4); + + fl->fl_ipsec_spi = ah_hdr[1]; + } + break; + + case IPPROTO_COMP: + if (pskb_may_pull(skb, xprth + 4 - skb->data)) { + u16 *ipcomp_hdr = (u16 *)(skb->nh.raw + skb->nh.iph->ihl*4); + + fl->fl_ipsec_spi = htonl(ntohs(ipcomp_hdr[1])); /* 16-bit CPI widened to a network-order 32-bit SPI */ + } + break; + default: + fl->fl_ipsec_spi = 0; + break; + }; + } + fl->proto = skb->nh.iph->protocol; /* re-read the header: iph may be stale after a pull */ + fl->fl4_dst = skb->nh.iph->daddr; + fl->fl4_src = skb->nh.iph->saddr; +} + +static inline int xfrm4_garbage_collect(void) +{ + read_lock(&xfrm4_policy_afinfo.lock); + xfrm4_policy_afinfo.garbage_collect(); + read_unlock(&xfrm4_policy_afinfo.lock); + return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2); +} + +static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu) +{ + struct dst_entry *path =
dst->path; + + if (mtu < 68 + dst->header_len) + return; + + path->ops->update_pmtu(path, mtu); +} + +struct dst_ops xfrm4_dst_ops = { + .family = AF_INET, + .protocol = __constant_htons(ETH_P_IP), + .gc = xfrm4_garbage_collect, + .update_pmtu = xfrm4_update_pmtu, + .gc_thresh = 1024, + .entry_size = sizeof(struct xfrm_dst), +}; + +struct xfrm_policy_afinfo xfrm4_policy_afinfo = { + .family = AF_INET, + .lock = RW_LOCK_UNLOCKED, + .type_map = &xfrm4_type_map, + .dst_ops = &xfrm4_dst_ops, + .dst_lookup = xfrm4_dst_lookup, + .find_bundle = __xfrm4_find_bundle, + .bundle_create = __xfrm4_bundle_create, + .decode_session = _decode_session4, +}; + +void __init xfrm4_policy_init(void) +{ + xfrm_policy_register_afinfo(&xfrm4_policy_afinfo); +} + +void __exit xfrm4_policy_fini(void) +{ + xfrm_policy_unregister_afinfo(&xfrm4_policy_afinfo); +} + +void __init xfrm4_init(void) +{ + xfrm4_state_init(); + xfrm4_policy_init(); +} + +void __exit xfrm4_fini(void) +{ + //xfrm4_input_fini(); + xfrm4_policy_fini(); + xfrm4_state_fini(); +} + Index: net/ipv4/xfrm4_state.c =================================================================== RCS file: net/ipv4/xfrm4_state.c diff -N net/ipv4/xfrm4_state.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ b/net/ipv4/xfrm4_state.c 16 Apr 2004 13:16:23 -0000 1.3.18.1 @@ -0,0 +1,128 @@ +/* + * xfrm4_state.c + * + * Changes: + * YOSHIFUJI Hideaki @USAGI + * Split up af-specific portion + * + */ + +#include +#include +#include + +extern struct xfrm_state_afinfo xfrm4_state_afinfo; + +static void +__xfrm4_init_tempsel(struct xfrm_state *x, struct flowi *fl, + struct xfrm_tmpl *tmpl, + xfrm_address_t *daddr, xfrm_address_t *saddr) +{ + x->sel.daddr.a4 = fl->fl4_dst; + x->sel.saddr.a4 = fl->fl4_src; + x->sel.dport = fl->fl_ip_dport; + x->sel.dport_mask = ~0; + x->sel.sport = fl->fl_ip_sport; + x->sel.sport_mask = ~0; + x->sel.prefixlen_d = 32; + x->sel.prefixlen_s = 32; + x->sel.proto = fl->proto; + x->sel.ifindex = fl->oif; + x->id = tmpl->id; + if 
(x->id.daddr.a4 == 0) + x->id.daddr.a4 = daddr->a4; + x->props.saddr = tmpl->saddr; + if (x->props.saddr.a4 == 0) + x->props.saddr.a4 = saddr->a4; + x->props.mode = tmpl->mode; + x->props.reqid = tmpl->reqid; + x->props.family = AF_INET; +} + +static struct xfrm_state * +__xfrm4_state_lookup(xfrm_address_t *daddr, u32 spi, u8 proto) +{ + unsigned h = __xfrm4_spi_hash(daddr, spi, proto); + struct xfrm_state *x; + + list_for_each_entry(x, xfrm4_state_afinfo.state_byspi+h, byspi) { + if (x->props.family == AF_INET && + spi == x->id.spi && + daddr->a4 == x->id.daddr.a4 && + proto == x->id.proto) { + xfrm_state_hold(x); + return x; + } + } + return NULL; +} + +static struct xfrm_state * +__xfrm4_find_acq(u8 mode, u32 reqid, u8 proto, + xfrm_address_t *daddr, xfrm_address_t *saddr, + int create) +{ + struct xfrm_state *x, *x0; + unsigned h = __xfrm4_dst_hash(daddr); + + x0 = NULL; + + list_for_each_entry(x, xfrm4_state_afinfo.state_bydst+h, bydst) { + if (x->props.family == AF_INET && + daddr->a4 == x->id.daddr.a4 && + mode == x->props.mode && + proto == x->id.proto && + saddr->a4 == x->props.saddr.a4 && + reqid == x->props.reqid && + x->km.state == XFRM_STATE_ACQ) { + if (!x0) + x0 = x; + if (x->id.spi) + continue; + x0 = x; + break; + } + } + if (x0) { + xfrm_state_hold(x0); + } else if (create && (x0 = xfrm_state_alloc()) != NULL) { + x0->sel.daddr.a4 = daddr->a4; + x0->sel.saddr.a4 = saddr->a4; + x0->sel.prefixlen_d = 32; + x0->sel.prefixlen_s = 32; + x0->props.saddr.a4 = saddr->a4; + x0->km.state = XFRM_STATE_ACQ; + x0->id.daddr.a4 = daddr->a4; + x0->id.proto = proto; + x0->props.family = AF_INET; + x0->props.mode = mode; + x0->props.reqid = reqid; + x0->props.family = AF_INET; + x0->lft.hard_add_expires_seconds = XFRM_ACQ_EXPIRES; + xfrm_state_hold(x0); + mod_timer(&x0->timer, jiffies + XFRM_ACQ_EXPIRES*HZ); + xfrm_state_hold(x0); + list_add_tail(&x0->bydst, xfrm4_state_afinfo.state_bydst+h); + wake_up(&km_waitq); + } + return x0; +} + +static struct 
xfrm_state_afinfo xfrm4_state_afinfo = { + .family = AF_INET, + .lock = RW_LOCK_UNLOCKED, + .init_tempsel = __xfrm4_init_tempsel, + .state_lookup = __xfrm4_state_lookup, + .find_acq = __xfrm4_find_acq, +}; + +void __init xfrm4_state_init(void) +{ + xfrm_state_register_afinfo(&xfrm4_state_afinfo); +} + +void __exit xfrm4_state_fini(void) +{ + xfrm_state_unregister_afinfo(&xfrm4_state_afinfo); +} + Index: net/ipv4/xfrm4_tunnel.c =================================================================== RCS file: net/ipv4/xfrm4_tunnel.c diff -N net/ipv4/xfrm4_tunnel.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ b/net/ipv4/xfrm4_tunnel.c 16 Apr 2004 13:16:23 -0000 1.5.14.1 @@ -0,0 +1,204 @@ +/* xfrm4_tunnel.c: Generic IP tunnel transformer. + * + * Copyright (C) 2003 David S. Miller (davem@redhat.com) + */ + +#include +#include +#include +#include +#include + +int xfrm4_tunnel_check_size(struct sk_buff *skb) +{ + int mtu, ret = 0; + struct dst_entry *dst; + struct iphdr *iph = skb->nh.iph; + + if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE) + goto out; + + IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE; + + if (!(iph->frag_off & htons(IP_DF))) + goto out; + + dst = skb->dst; + mtu = dst_pmtu(dst) - dst->header_len - dst->trailer_len; + if (skb->len > mtu) { + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); + ret = -EMSGSIZE; + } +out: + return ret; +} + +static int ipip_output(struct sk_buff *skb) +{ + struct dst_entry *dst = skb->dst; + struct xfrm_state *x = dst->xfrm; + struct iphdr *iph, *top_iph; + int tos, err; + + if ((err = xfrm4_tunnel_check_size(skb)) != 0) + goto error_nolock; + + iph = skb->nh.iph; + + spin_lock_bh(&x->lock); + + tos = iph->tos; + + top_iph = (struct iphdr *) skb_push(skb, x->props.header_len); + top_iph->ihl = 5; + top_iph->version = 4; + top_iph->tos = INET_ECN_encapsulate(tos, iph->tos); + top_iph->tot_len = htons(skb->len); + top_iph->frag_off = iph->frag_off & ~htons(IP_MF|IP_OFFSET); + if (!(iph->frag_off & htons(IP_DF))) { 
+#ifdef NETIF_F_TSO + __ip_select_ident(top_iph, dst, 0); +#else + __ip_select_ident(top_iph, dst); +#endif + } + top_iph->ttl = iph->ttl; + top_iph->protocol = IPPROTO_IPIP; + top_iph->check = 0; + top_iph->saddr = x->props.saddr.a4; + top_iph->daddr = x->id.daddr.a4; + memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + ip_send_check(top_iph); + + skb->nh.raw = skb->data; + x->curlft.bytes += skb->len; + x->curlft.packets++; + + spin_unlock_bh(&x->lock); + + if ((skb->dst = dst_pop(dst)) == NULL) { + /* do not free skb here: error_nolock frees it exactly once */ + err = -EHOSTUNREACH; + goto error_nolock; + } + return NET_XMIT_BYPASS; + +error_nolock: + kfree_skb(skb); /* single release point for every failure path */ + return err; +} + +static int ipip_xfrm_rcv(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb) +{ + return 0; +} + +static struct xfrm_tunnel *ipip_handler; +static DECLARE_MUTEX(xfrm4_tunnel_sem); + +int xfrm4_tunnel_register(struct xfrm_tunnel *handler) +{ + int ret; + + down(&xfrm4_tunnel_sem); + ret = 0; + if (ipip_handler != NULL) + ret = -EINVAL; + if (!ret) + ipip_handler = handler; + up(&xfrm4_tunnel_sem); + + return ret; +} + +int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler) +{ + int ret; + + down(&xfrm4_tunnel_sem); + ret = 0; + if (ipip_handler != handler) + ret = -EINVAL; + if (!ret) + ipip_handler = NULL; + up(&xfrm4_tunnel_sem); + + synchronize_net(); + + return ret; +} + +static int ipip_rcv(struct sk_buff *skb) +{ + struct xfrm_tunnel *handler = ipip_handler; + + /* Tunnel devices take precedence.
*/ + if (handler && handler->handler(skb) == 0) + return 0; + + return xfrm4_rcv_encap(skb, 0); +} + +static void ipip_err(struct sk_buff *skb, u32 info) +{ + struct xfrm_tunnel *handler = ipip_handler; + u32 arg = info; + + if (handler) + handler->err_handler(skb, &arg); +} + +static int ipip_init_state(struct xfrm_state *x, void *args) +{ + if (!x->props.mode) + return -EINVAL; + x->props.header_len = sizeof(struct iphdr); + + return 0; +} + +static void ipip_destroy(struct xfrm_state *x) +{ +} + +static struct xfrm_type ipip_type = { + .description = "IPIP", + .proto = IPPROTO_IPIP, + .init_state = ipip_init_state, + .destructor = ipip_destroy, + .input = ipip_xfrm_rcv, + .output = ipip_output +}; + +static struct inet_protocol ipip_protocol = { + .handler = ipip_rcv, + .err_handler = ipip_err, + .no_policy = 1, +}; + +static int __init ipip_init(void) +{ + SET_MODULE_OWNER(&ipip_type); + if (xfrm_register_type(&ipip_type, AF_INET) < 0) { + printk(KERN_INFO "ipip init: can't add xfrm type\n"); + return -EAGAIN; + } + if (inet_add_protocol(&ipip_protocol, IPPROTO_IPIP) < 0) { + printk(KERN_INFO "ipip init: can't add protocol\n"); + xfrm_unregister_type(&ipip_type, AF_INET); + return -EAGAIN; + } + return 0; +} + +static void __exit ipip_fini(void) +{ + if (inet_del_protocol(&ipip_protocol, IPPROTO_IPIP) < 0) + printk(KERN_INFO "ipip close: can't remove protocol\n"); + if (xfrm_unregister_type(&ipip_type, AF_INET) < 0) + printk(KERN_INFO "ipip close: can't remove xfrm type\n"); +} + +module_init(ipip_init); +module_exit(ipip_fini); +MODULE_LICENSE("GPL"); Index: net/ipv4/ipvs/ip_vs_conn.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv4/ipvs/ip_vs_conn.c,v retrieving revision 1.1.1.7 retrieving revision 1.1.1.7.2.1 diff -u -r1.1.1.7 -r1.1.1.7.2.1 --- a/net/ipv4/ipvs/ip_vs_conn.c 14 Apr 2004 13:05:41 -0000 1.1.1.7 +++ b/net/ipv4/ipvs/ip_vs_conn.c 16 Apr 2004 13:16:23 -0000 
1.1.1.7.2.1 @@ -606,17 +606,25 @@ struct iphdr *iph = skb->nh.iph; u8 tos = iph->tos; int mtu; + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip4_u = { + .daddr = iph->daddr, + .saddr = 0, + .tos = RT_TOS(tos), } }, + }; EnterFunction(10); - if (ip_route_output(&rt, iph->daddr, 0, RT_TOS(tos), 0)) { + if (ip_route_output_key(&rt, &fl)) { IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, " "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr)); goto tx_error_icmp; } /* MTU checking */ - mtu = rt->u.dst.pmtu; + mtu = dst_pmtu(&rt->u.dst); if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) { ip_rt_put(rt); icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); @@ -642,8 +650,7 @@ #ifdef CONFIG_NETFILTER_DEBUG skb->nf_debug = 1 << NF_IP_LOCAL_OUT; #endif /* CONFIG_NETFILTER_DEBUG */ - skb->nfcache |= NFC_IPVS_PROPERTY; - ip_send(skb); + IP_VS_XMIT(skb, rt); LeaveFunction(10); return NF_STOLEN; @@ -742,7 +749,7 @@ goto tx_error_icmp; /* MTU checking */ - mtu = rt->u.dst.pmtu; + mtu = dst_pmtu(&rt->u.dst); if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) { ip_rt_put(rt); icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); @@ -814,8 +821,7 @@ #ifdef CONFIG_NETFILTER_DEBUG skb->nf_debug = 1 << NF_IP_LOCAL_OUT; #endif /* CONFIG_NETFILTER_DEBUG */ - skb->nfcache |= NFC_IPVS_PROPERTY; - ip_send(skb); + IP_VS_XMIT(skb, rt); LeaveFunction(10); return NF_STOLEN; @@ -870,14 +876,14 @@ tdev = rt->u.dst.dev; - mtu = rt->u.dst.pmtu - sizeof(struct iphdr); + mtu = dst_pmtu(&rt->u.dst) - sizeof(struct iphdr); if (mtu < 68) { ip_rt_put(rt); IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n"); goto tx_error; } - if (skb->dst && mtu < skb->dst->pmtu) - skb->dst->pmtu = mtu; + if (skb->dst) + skb->dst->ops->update_pmtu(skb->dst, mtu); df |= (old_iph->frag_off&__constant_htons(IP_DF)); @@ -939,8 +945,7 @@ #ifdef CONFIG_NETFILTER_DEBUG skb->nf_debug = 1 << NF_IP_LOCAL_OUT; #endif /* CONFIG_NETFILTER_DEBUG */ - skb->nfcache |= 
NFC_IPVS_PROPERTY; - ip_send(skb); + IP_VS_XMIT(skb, rt); LeaveFunction(10); @@ -969,7 +974,7 @@ goto tx_error_icmp; /* MTU checking */ - mtu = rt->u.dst.pmtu; + mtu = dst_pmtu(&rt->u.dst); if ((iph->frag_off&__constant_htons(IP_DF)) && skb->len > mtu) { icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); ip_rt_put(rt); @@ -995,8 +1000,7 @@ #ifdef CONFIG_NETFILTER_DEBUG skb->nf_debug = 1 << NF_IP_LOCAL_OUT; #endif /* CONFIG_NETFILTER_DEBUG */ - skb->nfcache |= NFC_IPVS_PROPERTY; - ip_send(skb); + IP_VS_XMIT(skb, rt); #if 0000 NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, Index: net/ipv4/ipvs/ip_vs_core.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv4/ipvs/ip_vs_core.c,v retrieving revision 1.1.1.7 retrieving revision 1.1.1.7.2.1 diff -u -r1.1.1.7 -r1.1.1.7.2.1 --- a/net/ipv4/ipvs/ip_vs_core.c 28 Nov 2003 18:26:21 -0000 1.1.1.7 +++ b/net/ipv4/ipvs/ip_vs_core.c 16 Apr 2004 13:16:23 -0000 1.1.1.7.2.1 @@ -953,7 +953,7 @@ goto tx_error_icmp; /* MTU checking */ - mtu = rt->u.dst.pmtu; + mtu = dst_pmtu(&rt->u.dst); if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) { ip_rt_put(rt); icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); @@ -1001,7 +1001,7 @@ #ifdef CONFIG_NETFILTER_DEBUG skb->nf_debug = 1 << NF_IP_LOCAL_OUT; #endif /* CONFIG_NETFILTER_DEBUG */ - ip_send(skb); + IP_VS_XMIT(skb, rt); ip_vs_conn_put(cp); return NF_STOLEN; Index: net/ipv4/netfilter/ip_conntrack_standalone.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv4/netfilter/ip_conntrack_standalone.c,v retrieving revision 1.1.1.22 retrieving revision 1.1.1.22.2.1 diff -u -r1.1.1.22 -r1.1.1.22.2.1 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c 18 Feb 2004 13:36:32 -0000 1.1.1.22 +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c 16 Apr 2004 13:16:23 -0000 
1.1.1.22.2.1 @@ -204,7 +204,7 @@ /* Local packets are never produced too large for their interface. We degfragment them at LOCAL_OUT, however, so we have to refragment them here. */ - if ((*pskb)->len > rt->u.dst.pmtu) { + if ((*pskb)->len > dst_pmtu(&rt->u.dst)) { /* No hook can be after us, so this should be OK. */ ip_fragment(*pskb, okfn); return NF_STOLEN; Index: net/ipv4/netfilter/ip_fw_compat_masq.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv4/netfilter/ip_fw_compat_masq.c,v retrieving revision 1.1.1.18 retrieving revision 1.1.1.18.2.1 diff -u -r1.1.1.18 -r1.1.1.18.2.1 --- a/net/ipv4/netfilter/ip_fw_compat_masq.c 5 Jan 2004 13:53:56 -0000 1.1.1.18 +++ b/net/ipv4/netfilter/ip_fw_compat_masq.c 16 Apr 2004 13:16:23 -0000 1.1.1.18.2.1 @@ -68,12 +68,13 @@ /* Setup the masquerade, if not already */ if (!info->initialized) { u_int32_t newsrc; + struct flowi fl = { .nl_u = { .ip4_u = { .daddr = iph->daddr } } }; struct rtable *rt; struct ip_nat_multi_range range; /* Pass 0 instead of saddr, since it's going to be changed anyway. 
*/ - if (ip_route_output(&rt, iph->daddr, 0, 0, 0) != 0) { + if (ip_route_output_key(&rt, &fl) != 0) { DEBUGP("ipnat_rule_masquerade: Can't reroute.\n"); return NF_DROP; } Index: net/ipv4/netfilter/ip_nat_core.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv4/netfilter/ip_nat_core.c,v retrieving revision 1.1.1.23 retrieving revision 1.1.1.23.2.1 diff -u -r1.1.1.23 -r1.1.1.23.2.1 --- a/net/ipv4/netfilter/ip_nat_core.c 18 Feb 2004 13:36:32 -0000 1.1.1.23 +++ b/net/ipv4/netfilter/ip_nat_core.c 16 Apr 2004 13:16:23 -0000 1.1.1.23.2.1 @@ -204,10 +204,11 @@ static int do_extra_mangle(u_int32_t var_ip, u_int32_t *other_ipp) { + struct flowi fl = { .nl_u = { .ip4_u = { .daddr = var_ip } } }; struct rtable *rt; /* FIXME: IPTOS_TOS(iph->tos) --RR */ - if (ip_route_output(&rt, var_ip, 0, 0, 0) != 0) { + if (ip_route_output_key(&rt, &fl) != 0) { DEBUGP("do_extra_mangle: Can't get route to %u.%u.%u.%u\n", NIPQUAD(var_ip)); return 0; Index: net/ipv4/netfilter/ipt_MASQUERADE.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv4/netfilter/ipt_MASQUERADE.c,v retrieving revision 1.1.1.18 retrieving revision 1.1.1.18.2.1 diff -u -r1.1.1.18 -r1.1.1.18.2.1 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c 14 Apr 2004 13:05:41 -0000 1.1.1.18 +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c 16 Apr 2004 13:16:23 -0000 1.1.1.18.2.1 @@ -69,7 +69,6 @@ struct ip_nat_multi_range newrange; u_int32_t newsrc; struct rtable *rt; - struct rt_key key; IP_NF_ASSERT(hooknum == NF_IP_POST_ROUTING); @@ -84,25 +83,28 @@ mr = targinfo; - key.dst = (*pskb)->nh.iph->daddr; - key.src = 0; /* Unknown: that's what we're trying to establish */ - key.tos = RT_TOS((*pskb)->nh.iph->tos)|RTO_CONN; - key.oif = 0; + { + struct flowi fl = { .nl_u = { .ip4_u = + { .daddr = (*pskb)->nh.iph->daddr, + .tos = (RT_TOS((*pskb)->nh.iph->tos) | + RTO_CONN), 
#ifdef CONFIG_IP_ROUTE_FWMARK - key.fwmark = (*pskb)->nfmark; + .fwmark = (*pskb)->nfmark #endif - if (ip_route_output_key(&rt, &key) != 0) { - /* Funky routing can do this. */ - if (net_ratelimit()) - printk("MASQUERADE:" - " No route: Rusty's brain broke!\n"); - return NF_DROP; - } - if (rt->u.dst.dev != out) { - if (net_ratelimit()) - printk("MASQUERADE:" - " Route sent us somewhere else.\n"); - return NF_DROP; + } } }; + if (ip_route_output_key(&rt, &fl) != 0) { + /* Funky routing can do this. */ + if (net_ratelimit()) + printk("MASQUERADE:" + " No route: Rusty's brain broke!\n"); + return NF_DROP; + } + if (rt->u.dst.dev != out) { + if (net_ratelimit()) + printk("MASQUERADE:" + " Route sent us somewhere else.\n"); + return NF_DROP; + } } newsrc = rt->rt_src; Index: net/ipv4/netfilter/ipt_MIRROR.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv4/netfilter/ipt_MIRROR.c,v retrieving revision 1.1.1.18 retrieving revision 1.1.1.18.2.1 diff -u -r1.1.1.18 -r1.1.1.18.2.1 --- a/net/ipv4/netfilter/ipt_MIRROR.c 25 Aug 2003 11:44:44 -0000 1.1.1.18 +++ b/net/ipv4/netfilter/ipt_MIRROR.c 16 Apr 2004 13:16:23 -0000 1.1.1.18.2.1 @@ -44,21 +44,21 @@ { struct iphdr *iph = skb->nh.iph; struct dst_entry *odst; - struct rt_key key = {}; + struct flowi fl = {}; struct rtable *rt; if (local) { - key.dst = iph->saddr; - key.src = iph->daddr; - key.tos = RT_TOS(iph->tos); + fl.nl_u.ip4_u.daddr = iph->saddr; + fl.nl_u.ip4_u.saddr = iph->daddr; + fl.nl_u.ip4_u.tos = RT_TOS(iph->tos); - if (ip_route_output_key(&rt, &key) != 0) + if (ip_route_output_key(&rt, &fl) != 0) return NULL; } else { /* non-local src, find valid iif to satisfy * rp-filter when calling ip_route_input. 
*/ - key.dst = iph->daddr; - if (ip_route_output_key(&rt, &key) != 0) + fl.nl_u.ip4_u.daddr = iph->daddr; + if (ip_route_output_key(&rt, &fl) != 0) return NULL; odst = skb->dst; Index: net/ipv4/netfilter/ipt_REJECT.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv4/netfilter/ipt_REJECT.c,v retrieving revision 1.1.1.23 retrieving revision 1.1.1.23.2.1 diff -u -r1.1.1.23 -r1.1.1.23.2.1 --- a/net/ipv4/netfilter/ipt_REJECT.c 28 Nov 2003 18:26:21 -0000 1.1.1.23 +++ b/net/ipv4/netfilter/ipt_REJECT.c 16 Apr 2004 13:16:23 -0000 1.1.1.23.2.1 @@ -38,22 +38,22 @@ { struct iphdr *iph = skb->nh.iph; struct dst_entry *odst; - struct rt_key key = {}; + struct flowi fl = {}; struct rtable *rt; if (hook != NF_IP_FORWARD) { - key.dst = iph->saddr; + fl.nl_u.ip4_u.daddr = iph->saddr; if (hook == NF_IP_LOCAL_IN) - key.src = iph->daddr; - key.tos = RT_TOS(iph->tos); + fl.nl_u.ip4_u.saddr = iph->daddr; + fl.nl_u.ip4_u.tos = RT_TOS(iph->tos); - if (ip_route_output_key(&rt, &key) != 0) + if (ip_route_output_key(&rt, &fl) != 0) return NULL; } else { /* non-local src, find valid iif to satisfy * rp-filter when calling ip_route_input. */ - key.dst = iph->daddr; - if (ip_route_output_key(&rt, &key) != 0) + fl.nl_u.ip4_u.daddr = iph->daddr; + if (ip_route_output_key(&rt, &fl) != 0) return NULL; odst = skb->dst; @@ -188,7 +188,7 @@ nskb->nh.iph->ihl); /* "Never happens" */ - if (nskb->len > nskb->dst->pmtu) + if (nskb->len > dst_pmtu(nskb->dst)) goto free_nskb; connection_attach(nskb, oldskb->nfct); @@ -268,14 +268,19 @@ tos = (iph->tos & IPTOS_TOS_MASK) | IPTOS_PREC_INTERNETCONTROL; - if (ip_route_output(&rt, iph->saddr, saddr, RT_TOS(tos), 0)) - return; - + { + struct flowi fl = { .nl_u = { .ip4_u = + { .daddr = iph->saddr, + .saddr = saddr, + .tos = RT_TOS(tos) } } }; + if (ip_route_output_key(&rt, &fl)) + return; + } /* RFC says return as much as we can without exceeding 576 bytes. 
*/ length = skb_in->len + sizeof(struct iphdr) + sizeof(struct icmphdr); - if (length > rt->u.dst.pmtu) - length = rt->u.dst.pmtu; + if (length > dst_pmtu(&rt->u.dst)) + length = dst_pmtu(&rt->u.dst); if (length > 576) length = 576; Index: net/ipv4/netfilter/ipt_TCPMSS.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv4/netfilter/ipt_TCPMSS.c,v retrieving revision 1.1.1.15 retrieving revision 1.1.1.15.2.1 diff -u -r1.1.1.15 -r1.1.1.15.2.1 --- a/net/ipv4/netfilter/ipt_TCPMSS.c 21 Dec 2001 17:42:05 -0000 1.1.1.15 +++ b/net/ipv4/netfilter/ipt_TCPMSS.c 16 Apr 2004 13:16:23 -0000 1.1.1.15.2.1 @@ -85,14 +85,14 @@ return NF_DROP; /* or IPT_CONTINUE ?? */ } - if((*pskb)->dst->pmtu <= (sizeof(struct iphdr) + sizeof(struct tcphdr))) { + if(dst_pmtu((*pskb)->dst) <= (sizeof(struct iphdr) + sizeof(struct tcphdr))) { if (net_ratelimit()) printk(KERN_ERR - "ipt_tcpmss_target: unknown or invalid path-MTU (%d)\n", (*pskb)->dst->pmtu); + "ipt_tcpmss_target: unknown or invalid path-MTU (%d)\n", dst_pmtu((*pskb)->dst)); return NF_DROP; /* or IPT_CONTINUE ?? 
*/ } - newmss = (*pskb)->dst->pmtu - sizeof(struct iphdr) - sizeof(struct tcphdr); + newmss = dst_pmtu((*pskb)->dst) - sizeof(struct iphdr) - sizeof(struct tcphdr); } else newmss = tcpmssinfo->mss; Index: net/ipv4/netfilter/ipt_multiport.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv4/netfilter/ipt_multiport.c,v retrieving revision 1.1.1.13 retrieving revision 1.1.1.13.2.1 diff -u -r1.1.1.13 -r1.1.1.13.2.1 --- a/net/ipv4/netfilter/ipt_multiport.c 13 Jun 2003 14:51:39 -0000 1.1.1.13 +++ b/net/ipv4/netfilter/ipt_multiport.c 16 Apr 2004 13:16:23 -0000 1.1.1.13.2.1 @@ -4,6 +4,7 @@ #include #include #include +#include #include #include Index: net/ipv6/Config.in =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv6/Config.in,v retrieving revision 1.1.1.16 retrieving revision 1.1.1.16.2.1 diff -u -r1.1.1.16 -r1.1.1.16.2.1 --- a/net/ipv6/Config.in 21 Dec 2001 17:42:05 -0000 1.1.1.16 +++ b/net/ipv6/Config.in 16 Apr 2004 13:16:23 -0000 1.1.1.16.2.1 @@ -2,9 +2,14 @@ # IPv6 configuration # -#bool ' IPv6: flow policy support' CONFIG_RT6_POLICY -#bool ' IPv6: firewall support' CONFIG_IPV6_FIREWALL +bool 'IPv6: Privacy Extensions (RFC 3041) support' CONFIG_IPV6_PRIVACY if [ "$CONFIG_NETFILTER" != "n" ]; then source net/ipv6/netfilter/Config.in fi + +tristate 'IPv6: AH transformation' CONFIG_INET6_AH +tristate 'IPv6: ESP transformation' CONFIG_INET6_ESP +tristate 'IPv6: IPComp transformation' CONFIG_INET6_IPCOMP + +tristate 'IPv6: IPv6-in-IPv6 tunnel' CONFIG_IPV6_TUNNEL Index: net/ipv6/Makefile =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv6/Makefile,v retrieving revision 1.1.1.20 retrieving revision 1.1.1.20.2.1 diff -u -r1.1.1.20 -r1.1.1.20.2.1 --- a/net/ipv6/Makefile 28 Nov 2003 18:26:21 -0000 1.1.1.20 
+++ b/net/ipv6/Makefile 16 Apr 2004 13:16:23 -0000 1.1.1.20.2.1 @@ -9,7 +9,13 @@ O_TARGET := ipv6.o -obj-y := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o sit.o \ +mod-subdirs := netfilter + +ifeq ($(CONFIG_IPV6),m) +obj-m += ipv6.o +endif + +ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o sit.o \ route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o raw.o \ protocol.o icmp.o mcast.o reassembly.o tcp_ipv6.o \ exthdrs.o sysctl_net_ipv6.o datagram.o proc.o \ @@ -17,8 +23,29 @@ export-objs := ipv6_syms.o -obj-m := $(O_TARGET) +ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o +ipv6-objs += $(ipv6-y) + +obj-$(CONFIG_INET6_AH) += ah6.o +obj-$(CONFIG_INET6_ESP) += esp6.o +obj-$(CONFIG_INET6_IPCOMP) += ipcomp6.o + +obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o -#obj-$(CONFIG_IPV6_FIREWALL) += ip6_fw.o +subdir-$(CONFIG_NETFILTER) += netfilter + +ifeq ($(CONFIG_NETFILTER),y) +obj-y += netfilter/netfilter.o +endif + +ifeq ($(CONFIG_IPV6),y) +obj-y += $(ipv6-objs) +endif include $(TOPDIR)/Rules.make + + +ifeq ($(CONFIG_IPV6),m) +ipv6.o: $(ipv6-objs) + $(LD) -r -o $@ $(ipv6-objs) +endif Index: net/ipv6/addrconf.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv6/addrconf.c,v retrieving revision 1.1.1.28 retrieving revision 1.1.1.28.2.1 diff -u -r1.1.1.28 -r1.1.1.28.2.1 --- a/net/ipv6/addrconf.c 28 Nov 2003 18:26:21 -0000 1.1.1.28 +++ b/net/ipv6/addrconf.c 16 Apr 2004 13:16:23 -0000 1.1.1.28.2.1 @@ -28,6 +28,8 @@ * packets. * YOSHIFUJI Hideaki @USAGI : improved accuracy of * address validation timer. + * YOSHIFUJI Hideaki @USAGI : Privacy Extensions (RFC3041) + * support. * Yuji SEKIYA @USAGI : Don't assign a same IPv6 * address on a same interface. 
* YOSHIFUJI Hideaki @USAGI : ARCnet support @@ -66,6 +68,12 @@ #include #include +#ifdef CONFIG_IPV6_PRIVACY +#include +#include +#include +#endif + #include #define IPV6_MAX_ADDRESSES 16 @@ -87,6 +95,18 @@ int inet6_dev_count; int inet6_ifa_count; +#ifdef CONFIG_IPV6_PRIVACY +static int __ipv6_regen_rndid(struct inet6_dev *idev); +static int __ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr); +static void ipv6_regen_rndid(unsigned long data); + +static int desync_factor = MAX_DESYNC_FACTOR * HZ; +static struct crypto_tfm *md5_tfm; +static spinlock_t md5_tfm_lock = SPIN_LOCK_UNLOCKED; +#endif + +static int ipv6_count_addresses(struct inet6_dev *idev); + /* * Configured unicast address hash table */ @@ -125,6 +145,13 @@ MAX_RTR_SOLICITATIONS, /* router solicits */ RTR_SOLICITATION_INTERVAL, /* rtr solicit interval */ MAX_RTR_SOLICITATION_DELAY, /* rtr solicit delay */ +#ifdef CONFIG_IPV6_PRIVACY + .use_tempaddr = 0, + .temp_valid_lft = TEMP_VALID_LIFETIME, + .temp_prefered_lft = TEMP_PREFERRED_LIFETIME, + .regen_max_retry = REGEN_MAX_RETRY, + .max_desync_factor = MAX_DESYNC_FACTOR, +#endif }; static struct ipv6_devconf ipv6_devconf_dflt = @@ -139,6 +166,13 @@ MAX_RTR_SOLICITATIONS, /* router solicits */ RTR_SOLICITATION_INTERVAL, /* rtr solicit interval */ MAX_RTR_SOLICITATION_DELAY, /* rtr solicit delay */ +#ifdef CONFIG_IPV6_PRIVACY + .use_tempaddr = 0, + .temp_valid_lft = TEMP_VALID_LIFETIME, + .temp_prefered_lft = TEMP_PREFERRED_LIFETIME, + .regen_max_retry = REGEN_MAX_RETRY, + .max_desync_factor = MAX_DESYNC_FACTOR, +#endif }; /* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */ @@ -170,15 +204,8 @@ }; return type; } - /* check for reserved anycast addresses */ - - if ((st & htonl(0xE0000000)) && - ((addr->s6_addr32[2] == htonl(0xFDFFFFFF) && - (addr->s6_addr32[3] | htonl(0x7F)) == (u32)~0) || - (addr->s6_addr32[2] == 0 && addr->s6_addr32[3] == 0))) - type = IPV6_ADDR_ANYCAST; - else - type = IPV6_ADDR_UNICAST; + + type = 
IPV6_ADDR_UNICAST; /* Consider all addresses with the first three bits different of 000 and 111 as finished. @@ -299,10 +326,32 @@ /* We refer to the device */ dev_hold(dev); + /* One reference from device. We must do this before + * we invoke __ipv6_regen_rndid(). + */ + in6_dev_hold(ndev); + +#ifdef CONFIG_IPV6_PRIVACY + get_random_bytes(ndev->rndid, sizeof(ndev->rndid)); + get_random_bytes(ndev->entropy, sizeof(ndev->entropy)); + init_timer(&ndev->regen_timer); + ndev->regen_timer.function = ipv6_regen_rndid; + ndev->regen_timer.data = (unsigned long) ndev; + if ((dev->flags&IFF_LOOPBACK) || + dev->type == ARPHRD_TUNNEL || + dev->type == ARPHRD_SIT) { + printk(KERN_INFO + "Disabled Privacy Extensions on device %p(%s)\n", + dev, dev->name); + ndev->cnf.use_tempaddr = -1; + } else { + in6_dev_hold(ndev); + ipv6_regen_rndid((unsigned long) ndev); + } +#endif + write_lock_bh(&addrconf_lock); dev->ip6_ptr = ndev; - /* One reference from device */ - in6_dev_hold(ndev); write_unlock_bh(&addrconf_lock); ipv6_mc_init_dev(ndev); @@ -330,38 +379,6 @@ return idev; } -void ipv6_addr_prefix(struct in6_addr *prefix, - struct in6_addr *addr, int prefix_len) -{ - unsigned long mask; - int ncopy, nbits; - - memset(prefix, 0, sizeof(*prefix)); - - if (prefix_len <= 0) - return; - if (prefix_len > 128) - prefix_len = 128; - - ncopy = prefix_len / 32; - switch (ncopy) { - case 4: prefix->s6_addr32[3] = addr->s6_addr32[3]; - case 3: prefix->s6_addr32[2] = addr->s6_addr32[2]; - case 2: prefix->s6_addr32[1] = addr->s6_addr32[1]; - case 1: prefix->s6_addr32[0] = addr->s6_addr32[0]; - case 0: break; - } - nbits = prefix_len % 32; - if (nbits == 0) - return; - - mask = ~((1 << (32 - nbits)) - 1); - mask = htonl(mask); - - prefix->s6_addr32[ncopy] = addr->s6_addr32[ncopy] & mask; -} - - static void dev_forward_change(struct inet6_dev *idev) { struct net_device *dev; @@ -501,6 +518,18 @@ /* Add to inet6_dev unicast addr list. 
*/ ifa->if_next = idev->addr_list; idev->addr_list = ifa; + +#ifdef CONFIG_IPV6_PRIVACY + ifa->regen_count = 0; + if (ifa->flags&IFA_F_TEMPORARY) { + ifa->tmp_next = idev->tempaddr_list; + idev->tempaddr_list = ifa; + in6_ifa_hold(ifa); + } else { + ifa->tmp_next = NULL; + } +#endif + in6_ifa_hold(ifa); write_unlock_bh(&idev->lock); read_unlock(&addrconf_lock); @@ -523,6 +552,15 @@ ifp->dead = 1; +#ifdef CONFIG_IPV6_PRIVACY + spin_lock_bh(&ifp->lock); + if (ifp->ifpub) { + __in6_ifa_put(ifp->ifpub); + ifp->ifpub = NULL; + } + spin_unlock_bh(&ifp->lock); +#endif + write_lock_bh(&addrconf_hash_lock); for (ifap = &inet6_addr_lst[hash]; (ifa=*ifap) != NULL; ifap = &ifa->lst_next) { @@ -536,6 +574,24 @@ write_unlock_bh(&addrconf_hash_lock); write_lock_bh(&idev->lock); +#ifdef CONFIG_IPV6_PRIVACY + if (ifp->flags&IFA_F_TEMPORARY) { + for (ifap = &idev->tempaddr_list; (ifa=*ifap) != NULL; + ifap = &ifa->tmp_next) { + if (ifa == ifp) { + *ifap = ifa->tmp_next; + if (ifp->ifpub) { + __in6_ifa_put(ifp->ifpub); + ifp->ifpub = NULL; + } + __in6_ifa_put(ifp); + ifa->tmp_next = NULL; + break; + } + } + } +#endif + for (ifap = &idev->addr_list; (ifa=*ifap) != NULL; ifap = &ifa->if_next) { if (ifa == ifp) { @@ -556,6 +612,96 @@ in6_ifa_put(ifp); } +#ifdef CONFIG_IPV6_PRIVACY +static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, struct inet6_ifaddr *ift) +{ + struct inet6_dev *idev; + struct in6_addr addr, *tmpaddr; + unsigned long tmp_prefered_lft, tmp_valid_lft; + int tmp_plen; + int ret = 0; + + if (ift) { + spin_lock_bh(&ift->lock); + memcpy(&addr.s6_addr[8], &ift->addr.s6_addr[8], 8); + spin_unlock_bh(&ift->lock); + tmpaddr = &addr; + } else { + tmpaddr = NULL; + } +retry: + spin_lock_bh(&ifp->lock); + in6_ifa_hold(ifp); + idev = ifp->idev; + in6_dev_hold(idev); + memcpy(addr.s6_addr, ifp->addr.s6_addr, 8); + write_lock(&idev->lock); + if (idev->cnf.use_tempaddr <= 0) { + write_unlock(&idev->lock); + spin_unlock_bh(&ifp->lock); + printk(KERN_INFO + 
"ipv6_create_tempaddr(): use_tempaddr is disabled.\n"); + in6_dev_put(idev); + in6_ifa_put(ifp); + ret = -1; + goto out; + } + if (ifp->regen_count++ >= idev->cnf.regen_max_retry) { + idev->cnf.use_tempaddr = -1; /*XXX*/ + write_unlock(&idev->lock); + spin_unlock_bh(&ifp->lock); + printk(KERN_WARNING + "ipv6_create_tempaddr(): regeneration time exceeded. disabled temporary address support.\n"); + in6_dev_put(idev); + in6_ifa_put(ifp); + ret = -1; + goto out; + } + if (__ipv6_try_regen_rndid(idev, tmpaddr) < 0) { + write_unlock(&idev->lock); + spin_unlock_bh(&ifp->lock); + printk(KERN_WARNING + "ipv6_create_tempaddr(): regeneration of randomized interface id failed.\n"); + in6_dev_put(idev); + in6_ifa_put(ifp); + ret = -1; + goto out; + } + memcpy(&addr.s6_addr[8], idev->rndid, 8); + tmp_valid_lft = min_t(__u32, + ifp->valid_lft, + idev->cnf.temp_valid_lft); + tmp_prefered_lft = min_t(__u32, + ifp->prefered_lft, + idev->cnf.temp_prefered_lft - desync_factor / HZ); + tmp_plen = ifp->prefix_len; + write_unlock(&idev->lock); + spin_unlock_bh(&ifp->lock); + ift = ipv6_count_addresses(idev) < IPV6_MAX_ADDRESSES ? 
+ ipv6_add_addr(idev, &addr, tmp_plen, + ipv6_addr_type(&addr)&IPV6_ADDR_SCOPE_MASK, IFA_F_TEMPORARY) : 0; + if (!ift || IS_ERR(ift)) { + in6_dev_put(idev); + in6_ifa_put(ifp); + printk(KERN_INFO + "ipv6_create_tempaddr(): retry temporary address regeneration.\n"); + tmpaddr = &addr; + goto retry; + } + spin_lock_bh(&ift->lock); + ift->ifpub = ifp; + ift->valid_lft = tmp_valid_lft; + ift->prefered_lft = tmp_prefered_lft; + ift->tstamp = ifp->tstamp; + spin_unlock_bh(&ift->lock); + addrconf_dad_start(ift, 0); + in6_ifa_put(ift); + in6_dev_put(idev); +out: + return ret; +} +#endif + /* * Choose an apropriate source address * should do: @@ -564,6 +710,22 @@ * an address of the attached interface * iii) don't use deprecated addresses */ +static int inline ipv6_saddr_pref(const struct inet6_ifaddr *ifp, u8 invpref) +{ + int pref; + pref = ifp->flags&IFA_F_DEPRECATED ? 0 : 2; +#ifdef CONFIG_IPV6_PRIVACY + pref |= (ifp->flags^invpref)&IFA_F_TEMPORARY ? 0 : 1; +#endif + return pref; +} + +#ifdef CONFIG_IPV6_PRIVACY +#define IPV6_GET_SADDR_MAXSCORE(score) ((score) == 3) +#else +#define IPV6_GET_SADDR_MAXSCORE(score) (score) +#endif + int ipv6_dev_get_saddr(struct net_device *dev, struct in6_addr *daddr, struct in6_addr *saddr, int onlink) { @@ -572,6 +734,7 @@ struct inet6_dev *idev; int scope; int err; + int hiscore = -1, score; if (!onlink) @@ -594,17 +757,27 @@ read_lock_bh(&idev->lock); for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) { if (ifp->scope == scope) { - if (!(ifp->flags & (IFA_F_DEPRECATED|IFA_F_TENTATIVE))) { - in6_ifa_hold(ifp); + if (ifp->flags&IFA_F_TENTATIVE) + continue; +#ifdef CONFIG_IPV6_PRIVACY + score = ipv6_saddr_pref(ifp, idev->cnf.use_tempaddr > 1 ? 
IFA_F_TEMPORARY : 0); +#else + score = ipv6_saddr_pref(ifp, 0); +#endif + if (score <= hiscore) + continue; + + if (match) + in6_ifa_put(match); + match = ifp; + hiscore = score; + in6_ifa_hold(ifp); + + if (IPV6_GET_SADDR_MAXSCORE(score)) { read_unlock_bh(&idev->lock); read_unlock(&addrconf_lock); goto out; } - - if (!match && !(ifp->flags & IFA_F_TENTATIVE)) { - match = ifp; - in6_ifa_hold(ifp); - } } } read_unlock_bh(&idev->lock); @@ -627,16 +800,26 @@ read_lock_bh(&idev->lock); for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) { if (ifp->scope == scope) { - if (!(ifp->flags&(IFA_F_DEPRECATED|IFA_F_TENTATIVE))) { - in6_ifa_hold(ifp); + if (ifp->flags&IFA_F_TENTATIVE) + continue; +#ifdef CONFIG_IPV6_PRIVACY + score = ipv6_saddr_pref(ifp, idev->cnf.use_tempaddr > 1 ? IFA_F_TEMPORARY : 0); +#else + score = ipv6_saddr_pref(ifp, 0); +#endif + if (score <= hiscore) + continue; + + if (match) + in6_ifa_put(match); + match = ifp; + hiscore = score; + in6_ifa_hold(ifp); + + if (IPV6_GET_SADDR_MAXSCORE(score)) { read_unlock_bh(&idev->lock); goto out_unlock_base; } - - if (!match && !(ifp->flags&IFA_F_TENTATIVE)) { - match = ifp; - in6_ifa_hold(ifp); - } } } read_unlock_bh(&idev->lock); @@ -648,24 +831,16 @@ read_unlock(&dev_base_lock); out: - if (ifp == NULL) { - ifp = match; - match = NULL; - } - err = -EADDRNOTAVAIL; - if (ifp) { - ipv6_addr_copy(saddr, &ifp->addr); + if (match) { + ipv6_addr_copy(saddr, &match->addr); err = 0; - in6_ifa_put(ifp); - } - if (match) in6_ifa_put(match); + } return err; } - int ipv6_get_saddr(struct dst_entry *dst, struct in6_addr *daddr, struct in6_addr *saddr) { @@ -706,7 +881,7 @@ return err; } -int ipv6_count_addresses(struct inet6_dev *idev) +static int ipv6_count_addresses(struct inet6_dev *idev) { int cnt = 0; struct inet6_ifaddr *ifp; @@ -785,6 +960,21 @@ ifp->flags |= IFA_F_TENTATIVE; spin_unlock_bh(&ifp->lock); in6_ifa_put(ifp); +#ifdef CONFIG_IPV6_PRIVACY + } else if (ifp->flags&IFA_F_TEMPORARY) { + struct inet6_ifaddr *ifpub; + 
spin_lock_bh(&ifp->lock); + ifpub = ifp->ifpub; + if (ifpub) { + in6_ifa_hold(ifpub); + spin_unlock_bh(&ifp->lock); + ipv6_create_tempaddr(ifpub, ifp); + in6_ifa_put(ifpub); + } else { + spin_unlock_bh(&ifp->lock); + } + ipv6_del_addr(ifp); +#endif } else ipv6_del_addr(ifp); } @@ -857,6 +1047,110 @@ return err; } +#ifdef CONFIG_IPV6_PRIVACY +/* (re)generation of randomized interface identifier (RFC 3041 3.2, 3.5) */ +static int __ipv6_regen_rndid(struct inet6_dev *idev) +{ + struct net_device *dev; + struct scatterlist sg[2]; + + sg[0].page = virt_to_page(idev->entropy); + sg[0].offset = ((long) idev->entropy & ~PAGE_MASK); + sg[0].length = 8; + sg[1].page = virt_to_page(idev->work_eui64); + sg[1].offset = ((long) idev->work_eui64 & ~PAGE_MASK); + sg[1].length = 8; + + dev = idev->dev; + + if (ipv6_generate_eui64(idev->work_eui64, dev)) { + printk(KERN_INFO + "__ipv6_regen_rndid(idev=%p): cannot get EUI64 identifier; use random bytes.\n", + idev); + get_random_bytes(idev->work_eui64, sizeof(idev->work_eui64)); + } +regen: + spin_lock(&md5_tfm_lock); + if (unlikely(md5_tfm == NULL)) { + spin_unlock(&md5_tfm_lock); + return -1; + } + crypto_digest_init(md5_tfm); + crypto_digest_update(md5_tfm, sg, 2); + crypto_digest_final(md5_tfm, idev->work_digest); + spin_unlock(&md5_tfm_lock); + + memcpy(idev->rndid, &idev->work_digest[0], 8); + idev->rndid[0] &= ~0x02; + memcpy(idev->entropy, &idev->work_digest[8], 8); + + /* + * : + * check if generated address is not inappropriate + * + * - Reserved subnet anycast (RFC 2526) + * 11111101 11....11 1xxxxxxx + * - ISATAP (draft-ietf-ngtrans-isatap-01.txt) 4.3 + * 00-00-5E-FE-xx-xx-xx-xx + * - value 0 + * - XXX: already assigned to an address on the device + */ + if (idev->rndid[0] == 0xfd && + (idev->rndid[1]&idev->rndid[2]&idev->rndid[3]&idev->rndid[4]&idev->rndid[5]&idev->rndid[6]) == 0xff && + (idev->rndid[7]&0x80)) + goto regen; + if ((idev->rndid[0]|idev->rndid[1]) == 0) { + if (idev->rndid[2] == 0x5e && idev->rndid[3] == 0xfe) +
goto regen; + if ((idev->rndid[2]|idev->rndid[3]|idev->rndid[4]|idev->rndid[5]|idev->rndid[6]|idev->rndid[7]) == 0x00) + goto regen; + } + + return 0; +} + +static void ipv6_regen_rndid(unsigned long data) +{ + struct inet6_dev *idev = (struct inet6_dev *) data; + unsigned long expires; + + read_lock_bh(&addrconf_lock); + write_lock_bh(&idev->lock); + + if (idev->dead) + goto out; + + if (__ipv6_regen_rndid(idev) < 0) + goto out; + + expires = jiffies + + idev->cnf.temp_prefered_lft * HZ - + idev->cnf.regen_max_retry * idev->cnf.dad_transmits * idev->nd_parms->retrans_time - desync_factor; + if (time_before(expires, jiffies)) { + printk(KERN_WARNING + "ipv6_regen_rndid(): too short regeneration interval; timer disabled for %s.\n", + idev->dev->name); + goto out; + } + + if (!mod_timer(&idev->regen_timer, expires)) + in6_dev_hold(idev); + +out: + write_unlock_bh(&idev->lock); + read_unlock_bh(&addrconf_lock); + in6_dev_put(idev); +} + +static int __ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr) { + int ret = 0; + + if (tmpaddr && memcmp(idev->rndid, &tmpaddr->s6_addr[8], 8) == 0) + ret = __ipv6_regen_rndid(idev); + return ret; +} +#endif + /* * Add prefix route. 
*/ @@ -883,7 +1177,7 @@ if (dev->type == ARPHRD_SIT && (dev->flags&IFF_POINTOPOINT)) rtmsg.rtmsg_flags |= RTF_NONEXTHOP; - ip6_route_add(&rtmsg, NULL); + ip6_route_add(&rtmsg, NULL, NULL); } /* Create "default" multicast route to the interface */ @@ -900,7 +1194,7 @@ rtmsg.rtmsg_ifindex = dev->ifindex; rtmsg.rtmsg_flags = RTF_UP; rtmsg.rtmsg_type = RTMSG_NEWROUTE; - ip6_route_add(&rtmsg, NULL); + ip6_route_add(&rtmsg, NULL, NULL); } static void sit_route_add(struct net_device *dev) @@ -917,7 +1211,7 @@ rtmsg.rtmsg_flags = RTF_UP|RTF_NONEXTHOP; rtmsg.rtmsg_ifindex = dev->ifindex; - ip6_route_add(&rtmsg, NULL); + ip6_route_add(&rtmsg, NULL, NULL); } static void addrconf_add_lroute(struct net_device *dev) @@ -948,7 +1242,6 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len) { struct prefix_info *pinfo; - struct rt6_info *rt; __u32 valid_lft; __u32 prefered_lft; int addr_type; @@ -1004,32 +1297,33 @@ else rt_expires = jiffies + valid_lft * HZ; - rt = rt6_lookup(&pinfo->prefix, NULL, dev->ifindex, 1); - - if (rt && ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0)) { - if (rt->rt6i_flags&RTF_EXPIRES) { - if (pinfo->onlink == 0 || valid_lft == 0) { - ip6_del_rt(rt, NULL); - rt = NULL; - } else { - rt->rt6i_expires = rt_expires; + if (pinfo->onlink) { + struct rt6_info *rt; + rt = rt6_lookup(&pinfo->prefix, NULL, dev->ifindex, 1); + + if (rt && ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0)) { + if (rt->rt6i_flags&RTF_EXPIRES) { + if (valid_lft == 0) { + ip6_del_rt(rt, NULL, NULL); + rt = NULL; + } else { + rt->rt6i_expires = rt_expires; + } } + } else if (valid_lft) { + addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len, + dev, rt_expires, RTF_ADDRCONF|RTF_EXPIRES|RTF_PREFIX_RT); } - } else if (pinfo->onlink && valid_lft) { - addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len, - dev, rt_expires, RTF_ADDRCONF|RTF_EXPIRES|RTF_PREFIX_RT); + if (rt) + dst_release(&rt->u.dst); } - if (rt) - dst_release(&rt->u.dst); /* Try to figure out 
our local address for this prefix */ if (pinfo->autoconf && in6_dev->cnf.autoconf) { struct inet6_ifaddr * ifp; struct in6_addr addr; - int plen; - - plen = pinfo->prefix_len >> 3; + int create = 0, update_lft = 0; if (pinfo->prefix_len == 64) { memcpy(&addr, &pinfo->prefix, 8); @@ -1058,33 +1352,95 @@ ifp = ipv6_add_addr(in6_dev, &addr, pinfo->prefix_len, addr_type&IPV6_ADDR_SCOPE_MASK, 0); - if (IS_ERR(ifp)) { + if (!ifp || IS_ERR(ifp)) { in6_dev_put(in6_dev); return; } + update_lft = create = 1; addrconf_dad_start(ifp, RTF_ADDRCONF|RTF_PREFIX_RT); } - if (ifp && valid_lft == 0) { - ipv6_del_addr(ifp); - ifp = NULL; - } - if (ifp) { int flags; + unsigned long now; +#ifdef CONFIG_IPV6_PRIVACY + struct inet6_ifaddr *ift; +#endif + u32 stored_lft; + /* update lifetime (RFC2462 5.5.3 e) */ spin_lock(&ifp->lock); - ifp->valid_lft = valid_lft; - ifp->prefered_lft = prefered_lft; - ifp->tstamp = jiffies; - flags = ifp->flags; - ifp->flags &= ~IFA_F_DEPRECATED; - spin_unlock(&ifp->lock); - - if (!(flags&IFA_F_TENTATIVE)) - ipv6_ifa_notify((flags&IFA_F_DEPRECATED) ? - 0 : RTM_NEWADDR, ifp); + now = jiffies; + if (ifp->valid_lft > (now - ifp->tstamp) / HZ) + stored_lft = ifp->valid_lft - (now - ifp->tstamp) / HZ; + else + stored_lft = 0; + if (!update_lft && stored_lft) { + if (valid_lft > MIN_VALID_LIFETIME || + valid_lft > stored_lft) + update_lft = 1; + else if (stored_lft <= MIN_VALID_LIFETIME) { + /* valid_lft <= stored_lft is always true */ + /* XXX: IPsec */ + update_lft = 0; + } else { + valid_lft = MIN_VALID_LIFETIME; + if (valid_lft < prefered_lft) + prefered_lft = valid_lft; + update_lft = 1; + } + } + + if (update_lft) { + ifp->valid_lft = valid_lft; + ifp->prefered_lft = prefered_lft; + ifp->tstamp = now; + flags = ifp->flags; + ifp->flags &= ~IFA_F_DEPRECATED; + spin_unlock(&ifp->lock); + + if (!(flags&IFA_F_TENTATIVE)) + ipv6_ifa_notify((flags&IFA_F_DEPRECATED) ? 
+ 0 : RTM_NEWADDR, ifp); + } else + spin_unlock(&ifp->lock); + +#ifdef CONFIG_IPV6_PRIVACY + read_lock_bh(&in6_dev->lock); + /* update all temporary addresses in the list */ + for (ift=in6_dev->tempaddr_list; ift; ift=ift->tmp_next) { + /* + * When adjusting the lifetimes of an existing + * temporary address, only lower the lifetimes. + * Implementations must not increase the + * lifetimes of an existing temporary address + * when processing a Prefix Information Option. + */ + spin_lock(&ift->lock); + flags = ift->flags; + if (ift->valid_lft > valid_lft && + ift->valid_lft - valid_lft > (jiffies - ift->tstamp) / HZ) + ift->valid_lft = valid_lft + (jiffies - ift->tstamp) / HZ; + if (ift->prefered_lft > prefered_lft && + ift->prefered_lft - prefered_lft > (jiffies - ift->tstamp) / HZ) + ift->prefered_lft = prefered_lft + (jiffies - ift->tstamp) / HZ; + spin_unlock(&ift->lock); + if (!(flags&IFA_F_TENTATIVE)) + ipv6_ifa_notify(0, ift); + } + + if (create && in6_dev->cnf.use_tempaddr > 0) { + /* + * When a new public address is created as described in [ADDRCONF], + * also create a new temporary address. 
+ */ + read_unlock_bh(&in6_dev->lock); + ipv6_create_tempaddr(ifp, NULL); + } else { + read_unlock_bh(&in6_dev->lock); + } +#endif in6_ifa_put(ifp); addrconf_verify(0); } @@ -1515,6 +1871,27 @@ /* Step 3: clear address list */ write_lock_bh(&idev->lock); +#ifdef CONFIG_IPV6_PRIVACY + if (how == 1 && del_timer(&idev->regen_timer)) + in6_dev_put(idev); + + /* clear tempaddr list */ + while ((ifa = idev->tempaddr_list) != NULL) { + idev->tempaddr_list = ifa->tmp_next; + ifa->tmp_next = NULL; + ifa->dead = 1; + write_unlock_bh(&idev->lock); + spin_lock_bh(&ifa->lock); + + if (ifa->ifpub) { + in6_ifa_put(ifa->ifpub); + ifa->ifpub = NULL; + } + spin_unlock_bh(&ifa->lock); + in6_ifa_put(ifa); + write_lock_bh(&idev->lock); + } +#endif while ((ifa = idev->addr_list) != NULL) { idev->addr_list = ifa->if_next; ifa->if_next = NULL; @@ -1539,10 +1916,11 @@ /* Shot the device (if unregistered) */ if (how == 1) { - neigh_parms_release(&nd_tbl, idev->nd_parms); #ifdef CONFIG_SYSCTL addrconf_sysctl_unregister(&idev->cnf); + neigh_sysctl_unregister(idev->nd_parms); #endif + neigh_parms_release(&nd_tbl, idev->nd_parms); in6_dev_put(idev); } return 0; @@ -1592,7 +1970,7 @@ rtmsg.rtmsg_ifindex = ifp->idev->dev->ifindex; - ip6_route_add(&rtmsg, NULL); + ip6_route_add(&rtmsg, NULL, NULL); } out: @@ -1612,7 +1990,8 @@ addrconf_join_solict(dev, &ifp->addr); if (ifp->prefix_len != 128 && (ifp->flags&IFA_F_PERMANENT)) - addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev, 0, flags); + addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev, 0, + flags); net_srandom(ifp->addr.s6_addr32[3]); rand_num = net_random() % (ifp->idev->cnf.rtr_solicit_delay ? 
: 1); @@ -1787,6 +2166,9 @@ write_lock(&addrconf_hash_lock); for (ifp=inet6_addr_lst[i]; ifp; ifp=ifp->lst_next) { unsigned long age; +#ifdef CONFIG_IPV6_PRIVACY + unsigned long regen_advance; +#endif if (ifp->flags & IFA_F_PERMANENT) continue; @@ -1794,6 +2176,12 @@ spin_lock(&ifp->lock); age = (now - ifp->tstamp) / HZ; +#ifdef CONFIG_IPV6_PRIVACY + regen_advance = ifp->idev->cnf.regen_max_retry * + ifp->idev->cnf.dad_transmits * + ifp->idev->nd_parms->retrans_time / HZ; +#endif + if (age >= ifp->valid_lft) { spin_unlock(&ifp->lock); in6_ifa_hold(ifp); @@ -1822,6 +2210,28 @@ in6_ifa_put(ifp); goto restart; } +#ifdef CONFIG_IPV6_PRIVACY + } else if ((ifp->flags&IFA_F_TEMPORARY) && + !(ifp->flags&IFA_F_TENTATIVE)) { + if (age >= ifp->prefered_lft - regen_advance) { + struct inet6_ifaddr *ifpub = ifp->ifpub; + if (time_before(ifp->tstamp + ifp->prefered_lft * HZ, next)) + next = ifp->tstamp + ifp->prefered_lft * HZ; + if (!ifp->regen_count && ifpub) { + ifp->regen_count++; + in6_ifa_hold(ifp); + in6_ifa_hold(ifpub); + spin_unlock(&ifp->lock); + write_unlock(&addrconf_hash_lock); + ipv6_create_tempaddr(ifpub, ifp); + in6_ifa_put(ifpub); + in6_ifa_put(ifp); + goto restart; + } + } else if (time_before(ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ, next)) + next = ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ; + spin_unlock(&ifp->lock); +#endif } else { /* ifp->prefered_lft <= ifp->valid_lft */ if (time_before(ifp->tstamp + ifp->prefered_lft * HZ, next)) @@ -2106,7 +2516,7 @@ switch (event) { case RTM_NEWADDR: - ip6_rt_addr_add(&ifp->addr, ifp->idev->dev); + ip6_rt_addr_add(&ifp->addr, ifp->idev->dev, 0); break; case RTM_DELADDR: addrconf_leave_solict(ifp->idev->dev, &ifp->addr); @@ -2157,7 +2567,7 @@ static struct addrconf_sysctl_table { struct ctl_table_header *sysctl_header; - ctl_table addrconf_vars[11]; + ctl_table addrconf_vars[16]; ctl_table addrconf_dev[2]; ctl_table addrconf_conf_dir[2]; ctl_table addrconf_proto_dir[2]; @@ -2204,6 
+2614,28 @@ &ipv6_devconf.rtr_solicit_delay, sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, +#ifdef CONFIG_IPV6_PRIVACY + {NET_IPV6_USE_TEMPADDR, "use_tempaddr", + &ipv6_devconf.use_tempaddr, sizeof(int), 0644, NULL, + &proc_dointvec}, + + {NET_IPV6_TEMP_VALID_LFT, "temp_valid_lft", + &ipv6_devconf.temp_valid_lft, sizeof(int), 0644, NULL, + &proc_dointvec}, + + {NET_IPV6_TEMP_PREFERED_LFT, "temp_prefered_lft", + &ipv6_devconf.temp_prefered_lft, sizeof(int), 0644, NULL, + &proc_dointvec}, + + {NET_IPV6_REGEN_MAX_RETRY, "regen_max_retry", + &ipv6_devconf.regen_max_retry, sizeof(int), 0644, NULL, + &proc_dointvec}, + + {NET_IPV6_MAX_DESYNC_FACTOR, "max_desync_factor", + &ipv6_devconf.max_desync_factor, sizeof(int), 0644, NULL, + &proc_dointvec}, +#endif + {0}}, {{NET_PROTO_CONF_ALL, "all", NULL, 0, 0555, addrconf_sysctl.addrconf_vars},{0}}, @@ -2222,7 +2654,7 @@ if (t == NULL) return; memcpy(t, &addrconf_sysctl, sizeof(*t)); - for (i=0; i<sizeof(t->addrconf_vars)/sizeof(t->addrconf_vars[0])-1; i++) { + for (i=0; t->addrconf_vars[i].data; i++) { t->addrconf_vars[i].data += (char*)p - (char*)&ipv6_devconf; t->addrconf_vars[i].de = NULL; t->addrconf_vars[i].extra1 = idev; /* embedded; no ref */ @@ -2285,7 +2717,16 @@ { #ifdef MODULE struct net_device *dev; +#endif +#ifdef CONFIG_IPV6_PRIVACY + md5_tfm = crypto_alloc_tfm("md5", 0); + if (unlikely(md5_tfm == NULL)) + printk(KERN_WARNING + "failed to load transform for md5\n"); +#endif + +#ifdef MODULE /* This takes sense only during module load.
*/ rtnl_lock(); for (dev = dev_base; dev; dev = dev->next) { @@ -2371,6 +2812,13 @@ rtnl_unlock(); +#ifdef CONFIG_IPV6_PRIVACY + if (likely(md5_tfm != NULL)) { + crypto_free_tfm(md5_tfm); + md5_tfm = NULL; + } +#endif + #ifdef CONFIG_PROC_FS proc_net_remove("if_inet6"); #endif Index: net/ipv6/af_inet6.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv6/af_inet6.c,v retrieving revision 1.1.1.21 retrieving revision 1.1.1.21.2.1 diff -u -r1.1.1.21 -r1.1.1.21.2.1 --- a/net/ipv6/af_inet6.c 28 Nov 2003 18:26:21 -0000 1.1.1.21 +++ b/net/ipv6/af_inet6.c 16 Apr 2004 13:16:24 -0000 1.1.1.21.2.1 @@ -58,6 +58,9 @@ #include #include #include +#if CONFIG_IPV6_TUNNEL +#include +#endif #include #include @@ -181,7 +184,7 @@ /* Init the ipv4 part of the socket since we can have sockets * using v6 API for ipv4. */ - sk->protinfo.af_inet.ttl = 64; + sk->protinfo.af_inet.uc_ttl = -1; sk->protinfo.af_inet.mc_loop = 1; sk->protinfo.af_inet.mc_ttl = 1; @@ -630,6 +633,11 @@ */ inet6_register_protosw(&rawv6_protosw); + /* Register the family here so that the init calls below will + * be able to create sockets. (?? is this dangerous ??) + */ + (void) sock_register(&inet6_family_ops); + /* * ipngwg API draft makes clear that the correct semantics * for TCP and UDP is to consider one TCP and UDP instance @@ -646,6 +654,11 @@ err = ndisc_init(&inet6_family_ops); if (err) goto ndisc_fail; +#ifdef CONFIG_IPV6_TUNNEL + err = ip6_tunnel_init(); + if (err) + goto ip6_tunnel_fail; +#endif err = igmp6_init(&inet6_family_ops); if (err) goto igmp_fail; @@ -671,15 +684,17 @@ ip6_flowlabel_init(); addrconf_init(); sit_init(); + + /* Init v6 extention headers. */ + ipv6_rthdr_init(); ipv6_frag_init(); + ipv6_nodata_init(); + ipv6_destopt_init(); /* Init v6 transport protocols. */ udpv6_init(); tcpv6_init(); - /* Now the userspace is allowed to create INET6 sockets. 
*/ - (void) sock_register(&inet6_family_ops); - return 0; #ifdef CONFIG_PROC_FS @@ -697,6 +712,10 @@ igmp6_cleanup(); #endif igmp_fail: +#ifdef CONFIG_IPV6_TUNNEL + ip6_tunnel_cleanup(); +ip6_tunnel_fail: +#endif ndisc_cleanup(); ndisc_fail: icmpv6_cleanup(); @@ -730,6 +749,9 @@ ip6_route_cleanup(); ipv6_packet_cleanup(); igmp6_cleanup(); +#ifdef CONFIG_IPV6_TUNNEL + ip6_tunnel_cleanup(); +#endif ndisc_cleanup(); icmpv6_cleanup(); #ifdef CONFIG_SYSCTL Index: net/ipv6/ah6.c =================================================================== RCS file: net/ipv6/ah6.c diff -N net/ipv6/ah6.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ b/net/ipv6/ah6.c 16 Apr 2004 13:16:24 -0000 1.8.2.1 @@ -0,0 +1,521 @@ +/* + * Copyright (C)2002 USAGI/WIDE Project + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Authors + * + * Mitsuru KANDA @USAGI : IPv6 Support + * Kazunori MIYAZAWA @USAGI : + * Kunihiro Ishiguro + * + * This file is derived from net/ipv4/ah.c. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* XXX no ipv6 ah specific */ +#define NIP6(addr) \ + ntohs((addr).s6_addr16[0]),\ + ntohs((addr).s6_addr16[1]),\ + ntohs((addr).s6_addr16[2]),\ + ntohs((addr).s6_addr16[3]),\ + ntohs((addr).s6_addr16[4]),\ + ntohs((addr).s6_addr16[5]),\ + ntohs((addr).s6_addr16[6]),\ + ntohs((addr).s6_addr16[7]) + +static int zero_out_mutable_opts(struct ipv6_opt_hdr *opthdr) +{ + u8 *opt = (u8 *)opthdr; + int len = ipv6_optlen(opthdr); + int off = 0; + int optlen = 0; + + off += 2; + len -= 2; + + while (len > 0) { + + switch (opt[off]) { + + case IPV6_TLV_PAD0: + optlen = 1; + break; + default: + if (len < 2) + goto bad; + optlen = opt[off+1]+2; + if (len < optlen) + goto bad; + if (opt[off] & 0x20) + memset(&opt[off+2], 0, opt[off+1]); + break; + } + + off += optlen; + len -= optlen; + } + if (len == 0) + return 1; + +bad: + return 0; +} + +static int ipv6_clear_mutable_options(struct sk_buff *skb, u16 *nh_offset, int dir) +{ + u16 offset = sizeof(struct ipv6hdr); + struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset); + unsigned int packet_len = skb->tail - skb->nh.raw; + u8 nexthdr = skb->nh.ipv6h->nexthdr; + u8 nextnexthdr = 0; + + *nh_offset = ((unsigned char *)&skb->nh.ipv6h->nexthdr) - skb->nh.raw; + + while (offset + 1 <= packet_len) { + + switch (nexthdr) { + + case NEXTHDR_HOP: + *nh_offset = offset; + offset += ipv6_optlen(exthdr); + if (!zero_out_mutable_opts(exthdr)) { + if (net_ratelimit()) + printk(KERN_WARNING "overrun hopopts\n"); + return 0; + } + nexthdr = exthdr->nexthdr; + exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset); + break; + + case NEXTHDR_ROUTING: + *nh_offset = offset; + offset += ipv6_optlen(exthdr); + ((struct ipv6_rt_hdr*)exthdr)->segments_left = 0; + nexthdr = exthdr->nexthdr; + exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset); + break; + + case NEXTHDR_DEST: + *nh_offset = offset; + 
offset += ipv6_optlen(exthdr); + if (!zero_out_mutable_opts(exthdr)) { + if (net_ratelimit()) + printk(KERN_WARNING "overrun destopt\n"); + return 0; + } + nexthdr = exthdr->nexthdr; + exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset); + break; + + case NEXTHDR_AUTH: + if (dir == XFRM_POLICY_OUT) { + memset(((struct ipv6_auth_hdr*)exthdr)->auth_data, 0, + (((struct ipv6_auth_hdr*)exthdr)->hdrlen - 1) << 2); + } + if (exthdr->nexthdr == NEXTHDR_DEST) { + offset += (((struct ipv6_auth_hdr*)exthdr)->hdrlen + 2) << 2; + exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset); + nextnexthdr = exthdr->nexthdr; + if (!zero_out_mutable_opts(exthdr)) { + if (net_ratelimit()) + printk(KERN_WARNING "overrun destopt\n"); + return 0; + } + } + return nexthdr; + default : + return nexthdr; + } + } + + return nexthdr; +} + +int ah6_output(struct sk_buff *skb) +{ + int err; + int hdr_len = sizeof(struct ipv6hdr); + struct dst_entry *dst = skb->dst; + struct xfrm_state *x = dst->xfrm; + struct ipv6hdr *iph = NULL; + struct ip_auth_hdr *ah; + struct ah_data *ahp; + u16 nh_offset = 0; + u8 nexthdr; + + if (skb->ip_summed == CHECKSUM_HW && skb_checksum_help(skb) == NULL) { + err = -EINVAL; + goto error_nolock; + } + + spin_lock_bh(&x->lock); + err = xfrm_check_output(x, skb, AF_INET6); + if (err) + goto error; + + if (x->props.mode) { + iph = skb->nh.ipv6h; + skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, x->props.header_len); + skb->nh.ipv6h->version = 6; + skb->nh.ipv6h->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + skb->nh.ipv6h->nexthdr = IPPROTO_AH; + memcpy(&skb->nh.ipv6h->saddr, &x->props.saddr, sizeof(struct in6_addr)); + memcpy(&skb->nh.ipv6h->daddr, &x->id.daddr, sizeof(struct in6_addr)); + ah = (struct ip_auth_hdr*)(skb->nh.ipv6h+1); + ah->nexthdr = IPPROTO_IPV6; + } else { + hdr_len = skb->h.raw - skb->nh.raw; + iph = kmalloc(hdr_len, GFP_ATOMIC); + if (!iph) { + err = -ENOMEM; + goto error; + } + memcpy(iph, skb->data, hdr_len); + skb->nh.ipv6h = (struct
ipv6hdr*)skb_push(skb, x->props.header_len); + memcpy(skb->nh.ipv6h, iph, hdr_len); + nexthdr = ipv6_clear_mutable_options(skb, &nh_offset, XFRM_POLICY_OUT); + if (nexthdr == 0) + goto error; + + skb->nh.raw[nh_offset] = IPPROTO_AH; + skb->nh.ipv6h->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + ah = (struct ip_auth_hdr*)(skb->nh.raw+hdr_len); + skb->h.raw = (unsigned char*) ah; + ah->nexthdr = nexthdr; + } + + skb->nh.ipv6h->priority = 0; + skb->nh.ipv6h->flow_lbl[0] = 0; + skb->nh.ipv6h->flow_lbl[1] = 0; + skb->nh.ipv6h->flow_lbl[2] = 0; + skb->nh.ipv6h->hop_limit = 0; + + ahp = x->data; + ah->hdrlen = (XFRM_ALIGN8(sizeof(struct ipv6_auth_hdr) + + ahp->icv_trunc_len) >> 2) - 2; + + ah->reserved = 0; + ah->spi = x->id.spi; + ah->seq_no = htonl(++x->replay.oseq); + ahp->icv(ahp, skb, ah->auth_data); + + if (x->props.mode) { + skb->nh.ipv6h->hop_limit = iph->hop_limit; + skb->nh.ipv6h->priority = iph->priority; + skb->nh.ipv6h->flow_lbl[0] = iph->flow_lbl[0]; + skb->nh.ipv6h->flow_lbl[1] = iph->flow_lbl[1]; + skb->nh.ipv6h->flow_lbl[2] = iph->flow_lbl[2]; + if (x->props.flags & XFRM_STATE_NOECN) + IP6_ECN_clear(skb->nh.ipv6h); + } else { + memcpy(skb->nh.ipv6h, iph, hdr_len); + skb->nh.raw[nh_offset] = IPPROTO_AH; + skb->nh.ipv6h->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + kfree (iph); + } + + skb->nh.raw = skb->data; + + x->curlft.bytes += skb->len; + x->curlft.packets++; + spin_unlock_bh(&x->lock); + if ((skb->dst = dst_pop(dst)) == NULL) { + err = -EHOSTUNREACH; + goto error_nolock; + } + return NET_XMIT_BYPASS; +error: + spin_unlock_bh(&x->lock); +error_nolock: + kfree_skb(skb); + return err; +} + +int ah6_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb) +{ + /* + * Before process AH + * [IPv6][Ext1][Ext2][AH][Dest][Payload] + * |<-------------->| hdr_len + * |<------------------------>| cleared_hlen + * + * To erase AH: + * Keeping copy of cleared headers. 
After AH processing, + * Moving the pointer of skb->nh.raw by using skb_pull as long as AH + * header length. Then copy back the copy as long as hdr_len + * If destination header following AH exists, copy it into after [Ext2]. + * + * |<>|[IPv6][Ext1][Ext2][Dest][Payload] + * There is offset of AH before IPv6 header after the process. + */ + + struct ipv6_auth_hdr *ah; + struct ah_data *ahp; + unsigned char *tmp_hdr = NULL; + u16 hdr_len; + u16 ah_hlen; + u16 cleared_hlen; + u16 nh_offset = 0; + u8 nexthdr = 0; + u8 *prevhdr; + + if (!pskb_may_pull(skb, sizeof(struct ip_auth_hdr))) + goto out; + + /* We are going to _remove_ AH header to keep sockets happy, + * so... Later this can change. */ + if (skb_cloned(skb) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + goto out; + + hdr_len = skb->data - skb->nh.raw; + cleared_hlen = hdr_len; + ah = (struct ipv6_auth_hdr*)skb->data; + ahp = x->data; + nexthdr = ah->nexthdr; + ah_hlen = (ah->hdrlen + 2) << 2; + cleared_hlen += ah_hlen; + + if (nexthdr == NEXTHDR_DEST) { + struct ipv6_opt_hdr *dsthdr = (struct ipv6_opt_hdr*)(skb->data + ah_hlen); + cleared_hlen += ipv6_optlen(dsthdr); + } + + if (ah_hlen != XFRM_ALIGN8(sizeof(struct ipv6_auth_hdr) + ahp->icv_full_len) && + ah_hlen != XFRM_ALIGN8(sizeof(struct ipv6_auth_hdr) + ahp->icv_trunc_len)) + goto out; + + if (!pskb_may_pull(skb, ah_hlen)) + goto out; + + tmp_hdr = kmalloc(cleared_hlen, GFP_ATOMIC); + if (!tmp_hdr) + goto out; + memcpy(tmp_hdr, skb->nh.raw, cleared_hlen); + ipv6_clear_mutable_options(skb, &nh_offset, XFRM_POLICY_IN); + skb->nh.ipv6h->priority = 0; + skb->nh.ipv6h->flow_lbl[0] = 0; + skb->nh.ipv6h->flow_lbl[1] = 0; + skb->nh.ipv6h->flow_lbl[2] = 0; + skb->nh.ipv6h->hop_limit = 0; + + { + u8 auth_data[MAX_AH_AUTH_LEN]; + + memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); + memset(ah->auth_data, 0, ahp->icv_trunc_len); + skb_push(skb, skb->data - skb->nh.raw); + ahp->icv(ahp, skb, ah->auth_data); + if (memcmp(ah->auth_data, auth_data, 
ahp->icv_trunc_len)) { + if (net_ratelimit()) + printk(KERN_WARNING "ipsec ah authentication error\n"); + x->stats.integrity_failed++; + goto free_out; + } + } + + skb->nh.raw = skb_pull(skb, ah_hlen); + memcpy(skb->nh.raw, tmp_hdr, hdr_len); + if (nexthdr == NEXTHDR_DEST) { + memcpy(skb->nh.raw + hdr_len, + tmp_hdr + hdr_len + ah_hlen, + cleared_hlen - hdr_len - ah_hlen); + } + prevhdr = (u8*)(skb->nh.raw + nh_offset); + *prevhdr = nexthdr; + skb->nh.ipv6h->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + skb_pull(skb, hdr_len); + skb->h.raw = skb->data; + + + kfree(tmp_hdr); + + return nexthdr; + +free_out: + kfree(tmp_hdr); +out: + return -EINVAL; +} + +/* + * ICMPv6 error handler for AH. For Destination Unreachable and + * Packet Too Big errors, look up the SA matching the destination address + * and SPI of the offending packet, log it, and drop the reference. + * All other ICMPv6 types are ignored. + */ +void ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + int type, int code, int offset, __u32 info) +{ + struct ipv6hdr *iph = (struct ipv6hdr*)skb->data; + struct ip_auth_hdr *ah = (struct ip_auth_hdr*)(skb->data+offset); + struct xfrm_state *x; + + /* Bail out only when the type is neither of the two handled ones. */ + if (type != ICMPV6_DEST_UNREACH && + type != ICMPV6_PKT_TOOBIG) + return; + + x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET6); + if (!x) + return; + + printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/" + "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", + ntohl(ah->spi), NIP6(iph->daddr)); + + xfrm_state_put(x); +} + +static int ah6_init_state(struct xfrm_state *x, void *args) +{ + struct ah_data *ahp = NULL; + struct xfrm_algo_desc *aalg_desc; + + if (!x->aalg) + goto error; + + /* null auth can use a zero length key */ + if (x->aalg->alg_key_len > 512) + goto error; + + ahp = kmalloc(sizeof(*ahp), GFP_KERNEL); + if (ahp == NULL) + return -ENOMEM; + + memset(ahp, 0, sizeof(*ahp)); + + ahp->key = x->aalg->alg_key; + ahp->key_len = (x->aalg->alg_key_len+7)/8; + ahp->tfm = crypto_alloc_tfm(x->aalg->alg_name, 0); + if (!ahp->tfm) + goto error; + ahp->icv = ah_hmac_digest; + + /* + * Lookup the algorithm description maintained by xfrm_algo, + * verify crypto transform properties, and store information + * we need for AH 
processing. This lookup cannot fail here + * after a successful crypto_alloc_tfm(). + */ + aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name); + BUG_ON(!aalg_desc); + + if (aalg_desc->uinfo.auth.icv_fullbits/8 != + crypto_tfm_alg_digestsize(ahp->tfm)) { + printk(KERN_INFO "AH: %s digestsize %u != %hu\n", + x->aalg->alg_name, crypto_tfm_alg_digestsize(ahp->tfm), + aalg_desc->uinfo.auth.icv_fullbits/8); + goto error; + } + + ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8; + ahp->icv_trunc_len = aalg_desc->uinfo.auth.icv_truncbits/8; + + BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN); + + ahp->work_icv = kmalloc(ahp->icv_full_len, GFP_KERNEL); + if (!ahp->work_icv) + goto error; + + x->props.header_len = XFRM_ALIGN8(sizeof(struct ipv6_auth_hdr) + ahp->icv_trunc_len); + if (x->props.mode) + x->props.header_len += sizeof(struct ipv6hdr); + x->data = ahp; + + return 0; + +error: + if (ahp) { + if (ahp->work_icv) + kfree(ahp->work_icv); + if (ahp->tfm) + crypto_free_tfm(ahp->tfm); + kfree(ahp); + } + return -EINVAL; +} + +static void ah6_destroy(struct xfrm_state *x) +{ + struct ah_data *ahp = x->data; + + if (!ahp) + return; + + if (ahp->work_icv) { + kfree(ahp->work_icv); + ahp->work_icv = NULL; + } + if (ahp->tfm) { + crypto_free_tfm(ahp->tfm); + ahp->tfm = NULL; + } + kfree(ahp); +} + +static struct xfrm_type ah6_type = +{ + .description = "AH6", + .owner = THIS_MODULE, + .proto = IPPROTO_AH, + .init_state = ah6_init_state, + .destructor = ah6_destroy, + .input = ah6_input, + .output = ah6_output +}; + +static struct inet6_protocol ah6_protocol = { + .handler = xfrm6_rcv, + .err_handler = ah6_err, + .flags = INET6_PROTO_NOPOLICY, +}; + +int __init ah6_init(void) +{ + if (xfrm_register_type(&ah6_type, AF_INET6) < 0) { + printk(KERN_INFO "ipv6 ah init: can't add xfrm type\n"); + return -EAGAIN; + } + + if (inet6_add_protocol(&ah6_protocol, IPPROTO_AH) < 0) { + printk(KERN_INFO "ipv6 ah init: can't add protocol\n"); + xfrm_unregister_type(&ah6_type, AF_INET6); + 
return -EAGAIN; + } + + return 0; +} + +static void __exit ah6_fini(void) +{ + if (inet6_del_protocol(&ah6_protocol, IPPROTO_AH) < 0) + printk(KERN_INFO "ipv6 ah close: can't remove protocol\n"); + + if (xfrm_unregister_type(&ah6_type, AF_INET6) < 0) + printk(KERN_INFO "ipv6 ah close: can't remove xfrm type\n"); + +} + +module_init(ah6_init); +module_exit(ah6_fini); + +MODULE_LICENSE("GPL"); Index: net/ipv6/anycast.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv6/anycast.c,v retrieving revision 1.1.1.11 retrieving revision 1.1.1.11.2.1 diff -u -r1.1.1.11 -r1.1.1.11.2.1 --- a/net/ipv6/anycast.c 25 Aug 2003 11:44:44 -0000 1.1.1.11 +++ b/net/ipv6/anycast.c 16 Apr 2004 13:16:24 -0000 1.1.1.11.2.1 @@ -95,7 +95,6 @@ return onlink; } - /* * socket join an anycast group */ @@ -109,8 +108,12 @@ int ishost = !ipv6_devconf.forwarding; int err = 0; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; if (ipv6_addr_type(addr) & IPV6_ADDR_MULTICAST) return -EINVAL; + if (ipv6_chk_addr(addr, NULL)) + return -EINVAL; pac = sock_kmalloc(sk, sizeof(struct ipv6_ac_socklist), GFP_KERNEL); if (pac == NULL) @@ -160,21 +163,12 @@ * For hosts, allow link-local or matching prefix anycasts. * This obviates the need for propagating anycast routes while * still allowing some non-router anycast participation. - * - * allow anyone to join anycasts that don't require a special route - * and can't be spoofs of unicast addresses (reserved anycast only) */ if (!ip6_onlink(addr, dev)) { if (ishost) err = -EADDRNOTAVAIL; - else if (!capable(CAP_NET_ADMIN)) - err = -EPERM; if (err) goto out_dev_put; - } else if (!(ipv6_addr_type(addr) & IPV6_ADDR_ANYCAST) && - !capable(CAP_NET_ADMIN)) { - err = -EPERM; - goto out_dev_put; } err = ipv6_dev_ac_inc(dev, addr); @@ -265,6 +259,13 @@ dev_put(dev); } +#if 0 +/* The function is not used, which is funny. 
Apparently, author + * supposed to use it to filter out datagrams inside udp/raw but forgot. + * + * It is OK, anycasts are not special comparing to delivery to unicasts. + */ + int inet6_ac_check(struct sock *sk, struct in6_addr *addr, int ifindex) { struct ipv6_ac_socklist *pac; @@ -285,6 +286,8 @@ return found; } +#endif + static void aca_put(struct ifacaddr6 *ac) { if (atomic_dec_and_test(&ac->aca_refcnt)) { @@ -346,7 +349,7 @@ idev->ac_list = aca; write_unlock_bh(&idev->lock); - ip6_rt_addr_add(&aca->aca_addr, dev); + ip6_rt_addr_add(&aca->aca_addr, dev, 1); addrconf_join_solict(dev, &aca->aca_addr); Index: net/ipv6/datagram.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv6/datagram.c,v retrieving revision 1.1.1.19 retrieving revision 1.1.1.19.2.1 diff -u -r1.1.1.19 -r1.1.1.19.2.1 --- a/net/ipv6/datagram.c 13 Jun 2003 14:51:39 -0000 1.1.1.19 +++ b/net/ipv6/datagram.c 16 Apr 2004 13:16:24 -0000 1.1.1.19.2.1 @@ -78,7 +78,7 @@ iph = (struct ipv6hdr*)skb_put(skb, sizeof(struct ipv6hdr)); skb->nh.ipv6h = iph; - memcpy(&iph->daddr, fl->fl6_dst, 16); + ipv6_addr_copy(&iph->daddr, &fl->fl6_dst); serr = SKB_EXT_ERR(skb); serr->ee.ee_errno = err; @@ -89,7 +89,7 @@ serr->ee.ee_info = info; serr->ee.ee_data = 0; serr->addr_offset = (u8*)&iph->daddr - skb->nh.raw; - serr->port = fl->uli_u.ports.dport; + serr->port = fl->fl_ip_dport; skb->h.raw = skb->tail; __skb_pull(skb, skb->tail - skb->data); @@ -291,7 +291,8 @@ goto exit_f; } - fl->fl6_src = &src_info->ipi6_addr; + ipv6_addr_copy(&fl->fl6_src, + &src_info->ipi6_addr); } break; Index: net/ipv6/esp6.c =================================================================== RCS file: net/ipv6/esp6.c diff -N net/ipv6/esp6.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ b/net/ipv6/esp6.c 16 Apr 2004 13:16:24 -0000 1.6.18.1 @@ -0,0 +1,500 @@ +/* + * Copyright (C)2002 USAGI/WIDE Project + * + * This program is free software; you can 
redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Authors + * + * Mitsuru KANDA @USAGI : IPv6 Support + * Kazunori MIYAZAWA @USAGI : + * Kunihiro Ishiguro + * + * This file is derived from net/ipv4/esp.c + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX_SG_ONSTACK 4 + +/* XXX no ipv6 esp specific */ +#define NIP6(addr) \ + ntohs((addr).s6_addr16[0]),\ + ntohs((addr).s6_addr16[1]),\ + ntohs((addr).s6_addr16[2]),\ + ntohs((addr).s6_addr16[3]),\ + ntohs((addr).s6_addr16[4]),\ + ntohs((addr).s6_addr16[5]),\ + ntohs((addr).s6_addr16[6]),\ + ntohs((addr).s6_addr16[7]) + +int esp6_output(struct sk_buff *skb) +{ + int err; + int hdr_len = 0; + struct dst_entry *dst = skb->dst; + struct xfrm_state *x = dst->xfrm; + struct ipv6hdr *iph = NULL, *top_iph; + struct ipv6_esp_hdr *esph; + struct crypto_tfm *tfm; + struct esp_data *esp; + struct sk_buff *trailer; + int blksize; + int clen; + int alen; + int nfrags; + u8 *prevhdr; + u8 nexthdr = 0; + + /* First, if the skb is not checksummed, complete checksum. 
*/ + if (skb->ip_summed == CHECKSUM_HW && skb_checksum_help(skb) == NULL) { + err = -EINVAL; + goto error_nolock; + } + + spin_lock_bh(&x->lock); + err = xfrm_check_output(x, skb, AF_INET6); + if (err) + goto error; + err = -ENOMEM; + + /* Strip IP header in transport mode. Save it. */ + + if (!x->props.mode) { + hdr_len = ip6_find_1stfragopt(skb, &prevhdr); + nexthdr = *prevhdr; + *prevhdr = IPPROTO_ESP; + iph = kmalloc(hdr_len, GFP_ATOMIC); + if (!iph) { + err = -ENOMEM; + goto error; + } + memcpy(iph, skb->nh.raw, hdr_len); + __skb_pull(skb, hdr_len); + } + + /* Now skb is pure payload to encrypt */ + + /* Round to block size */ + clen = skb->len; + + esp = x->data; + alen = esp->auth.icv_trunc_len; + tfm = esp->conf.tfm; + blksize = (crypto_tfm_alg_blocksize(tfm) + 3) & ~3; + clen = (clen + 2 + blksize-1)&~(blksize-1); + if (esp->conf.padlen) + clen = (clen + esp->conf.padlen-1)&~(esp->conf.padlen-1); + + if ((nfrags = skb_cow_data(skb, clen-skb->len+alen, &trailer)) < 0) { + if (!x->props.mode && iph) kfree(iph); + goto error; + } + + /* Fill padding... 
*/ + do { + int i; + /* Pad bytes are 1, 2, 3, ...; the last two trailer bytes are left for pad length and next header. */ + for (i=0; i<clen-skb->len - 2; i++) + *(u8*)(trailer->tail + i) = i+1; + } while (0); + *(u8*)(trailer->tail + clen-skb->len - 2) = (clen - skb->len)-2; + pskb_put(skb, trailer, clen - skb->len); + + if (x->props.mode) { + iph = skb->nh.ipv6h; + top_iph = (struct ipv6hdr*)skb_push(skb, x->props.header_len); + esph = (struct ipv6_esp_hdr*)(top_iph+1); + *(u8*)(trailer->tail - 1) = IPPROTO_IPV6; + top_iph->version = 6; + top_iph->priority = iph->priority; + top_iph->flow_lbl[0] = iph->flow_lbl[0]; + top_iph->flow_lbl[1] = iph->flow_lbl[1]; + top_iph->flow_lbl[2] = iph->flow_lbl[2]; + if (x->props.flags & XFRM_STATE_NOECN) + IP6_ECN_clear(top_iph); + top_iph->nexthdr = IPPROTO_ESP; + top_iph->payload_len = htons(skb->len + alen - sizeof(struct ipv6hdr)); + top_iph->hop_limit = iph->hop_limit; + memcpy(&top_iph->saddr, (struct in6_addr *)&x->props.saddr, sizeof(struct in6_addr)); + memcpy(&top_iph->daddr, (struct in6_addr *)&x->id.daddr, sizeof(struct in6_addr)); + } else { + esph = (struct ipv6_esp_hdr*)skb_push(skb, x->props.header_len); + skb->h.raw = (unsigned char*)esph; + top_iph = (struct ipv6hdr*)skb_push(skb, hdr_len); + memcpy(top_iph, iph, hdr_len); + kfree(iph); + top_iph->payload_len = htons(skb->len + alen - sizeof(struct ipv6hdr)); + *(u8*)(trailer->tail - 1) = nexthdr; + } + + esph->spi = x->id.spi; + esph->seq_no = htonl(++x->replay.oseq); + + if (esp->conf.ivlen) + crypto_cipher_set_iv(tfm, esp->conf.ivec, crypto_tfm_alg_ivsize(tfm)); + + do { + struct scatterlist sgbuf[nfrags>MAX_SG_ONSTACK ? 
0 : nfrags]; + struct scatterlist *sg = sgbuf; + + if (unlikely(nfrags > MAX_SG_ONSTACK)) { + sg = kmalloc(sizeof(struct scatterlist)*nfrags, GFP_ATOMIC); + if (!sg) + goto error; + } + skb_to_sgvec(skb, sg, esph->enc_data+esp->conf.ivlen-skb->data, clen); + crypto_cipher_encrypt(tfm, sg, sg, clen); + if (unlikely(sg != sgbuf)) + kfree(sg); + } while (0); + + if (esp->conf.ivlen) { + memcpy(esph->enc_data, esp->conf.ivec, crypto_tfm_alg_ivsize(tfm)); + crypto_cipher_get_iv(tfm, esp->conf.ivec, crypto_tfm_alg_ivsize(tfm)); + } + + if (esp->auth.icv_full_len) { + esp->auth.icv(esp, skb, (u8*)esph-skb->data, + sizeof(struct ipv6_esp_hdr) + esp->conf.ivlen+clen, trailer->tail); + pskb_put(skb, trailer, alen); + } + + skb->nh.raw = skb->data; + + x->curlft.bytes += skb->len; + x->curlft.packets++; + spin_unlock_bh(&x->lock); + if ((skb->dst = dst_pop(dst)) == NULL) { + err = -EHOSTUNREACH; + goto error_nolock; + } + return NET_XMIT_BYPASS; + +error: + spin_unlock_bh(&x->lock); +error_nolock: + kfree_skb(skb); + return err; +} + +int esp6_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb) +{ + struct ipv6hdr *iph; + struct ipv6_esp_hdr *esph; + struct esp_data *esp = x->data; + struct sk_buff *trailer; + int blksize = crypto_tfm_alg_blocksize(esp->conf.tfm); + int alen = esp->auth.icv_trunc_len; + int elen = skb->len - sizeof(struct ipv6_esp_hdr) - esp->conf.ivlen - alen; + + int hdr_len = skb->h.raw - skb->nh.raw; + int nfrags; + unsigned char *tmp_hdr = NULL; + int ret = 0; + + if (!pskb_may_pull(skb, sizeof(struct ipv6_esp_hdr))) { + ret = -EINVAL; + goto out_nofree; + } + + if (elen <= 0 || (elen & (blksize-1))) { + ret = -EINVAL; + goto out_nofree; + } + + tmp_hdr = kmalloc(hdr_len, GFP_ATOMIC); + if (!tmp_hdr) { + ret = -ENOMEM; + goto out_nofree; + } + memcpy(tmp_hdr, skb->nh.raw, hdr_len); + + /* If integrity check is required, do this. 
*/ + if (esp->auth.icv_full_len) { + u8 sum[esp->auth.icv_full_len]; + u8 sum1[alen]; + + esp->auth.icv(esp, skb, 0, skb->len-alen, sum); + + if (skb_copy_bits(skb, skb->len-alen, sum1, alen)) + BUG(); + + if (unlikely(memcmp(sum, sum1, alen))) { + x->stats.integrity_failed++; + ret = -EINVAL; + goto out; + } + } + + if ((nfrags = skb_cow_data(skb, 0, &trailer)) < 0) { + ret = -EINVAL; + goto out; + } + + skb->ip_summed = CHECKSUM_NONE; + + esph = (struct ipv6_esp_hdr*)skb->data; + iph = skb->nh.ipv6h; + + /* Get ivec. This can be wrong, check against another impls. */ + if (esp->conf.ivlen) + crypto_cipher_set_iv(esp->conf.tfm, esph->enc_data, crypto_tfm_alg_ivsize(esp->conf.tfm)); + + { + u8 nexthdr[2]; + struct scatterlist sgbuf[nfrags>MAX_SG_ONSTACK ? 0 : nfrags]; + struct scatterlist *sg = sgbuf; + u8 padlen; + u8 *prevhdr; + + if (unlikely(nfrags > MAX_SG_ONSTACK)) { + sg = kmalloc(sizeof(struct scatterlist)*nfrags, GFP_ATOMIC); + if (!sg) { + ret = -ENOMEM; + goto out; + } + } + skb_to_sgvec(skb, sg, sizeof(struct ipv6_esp_hdr) + esp->conf.ivlen, elen); + crypto_cipher_decrypt(esp->conf.tfm, sg, sg, elen); + if (unlikely(sg != sgbuf)) + kfree(sg); + + if (skb_copy_bits(skb, skb->len-alen-2, nexthdr, 2)) + BUG(); + + padlen = nexthdr[0]; + if (padlen+2 >= elen) { + if (net_ratelimit()) { + printk(KERN_WARNING "ipsec esp packet is garbage padlen=%d, elen=%d\n", padlen+2, elen); + } + ret = -EINVAL; + goto out; + } + /* ... check padding bits here. Silly. 
:-) */ + + pskb_trim(skb, skb->len - alen - padlen - 2); + skb->h.raw = skb_pull(skb, sizeof(struct ipv6_esp_hdr) + esp->conf.ivlen); + skb->nh.raw += sizeof(struct ipv6_esp_hdr) + esp->conf.ivlen; + memcpy(skb->nh.raw, tmp_hdr, hdr_len); + skb->nh.ipv6h->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + ip6_find_1stfragopt(skb, &prevhdr); + ret = *prevhdr = nexthdr[1]; + } + +out: + kfree(tmp_hdr); +out_nofree: + return ret; +} + +static u32 esp6_get_max_size(struct xfrm_state *x, int mtu) +{ + struct esp_data *esp = x->data; + u32 blksize = crypto_tfm_alg_blocksize(esp->conf.tfm); + + if (x->props.mode) { + mtu = (mtu + 2 + blksize-1)&~(blksize-1); + } else { + /* The worst case. */ + mtu += 2 + blksize; + } + if (esp->conf.padlen) + mtu = (mtu + esp->conf.padlen-1)&~(esp->conf.padlen-1); + + return mtu + x->props.header_len + esp->auth.icv_full_len; +} + +/* + * ICMPv6 error handler for ESP. For Destination Unreachable and + * Packet Too Big errors, look up the SA matching the destination address + * and SPI of the offending packet, log it, and drop the reference. + * All other ICMPv6 types are ignored. + */ +void esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + int type, int code, int offset, __u32 info) +{ + struct ipv6hdr *iph = (struct ipv6hdr*)skb->data; + struct ipv6_esp_hdr *esph = (struct ipv6_esp_hdr*)(skb->data+offset); + struct xfrm_state *x; + + /* Bail out only when the type is neither of the two handled ones. */ + if (type != ICMPV6_DEST_UNREACH && + type != ICMPV6_PKT_TOOBIG) + return; + + x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET6); + if (!x) + return; + printk(KERN_DEBUG "pmtu discovery on SA ESP/%08x/" + "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", + ntohl(esph->spi), NIP6(iph->daddr)); + xfrm_state_put(x); +} + +void esp6_destroy(struct xfrm_state *x) +{ + struct esp_data *esp = x->data; + + if (!esp) + return; + + if (esp->conf.tfm) { + crypto_free_tfm(esp->conf.tfm); + esp->conf.tfm = NULL; + } + if (esp->conf.ivec) { + kfree(esp->conf.ivec); + esp->conf.ivec = NULL; + } + if (esp->auth.tfm) { + crypto_free_tfm(esp->auth.tfm); + esp->auth.tfm = NULL; + } + if (esp->auth.work_icv) { + kfree(esp->auth.work_icv); + esp->auth.work_icv = NULL; + } + kfree(esp); +} + +int esp6_init_state(struct xfrm_state *x, 
void *args) +{ + struct esp_data *esp = NULL; + + if (x->aalg) { + if (x->aalg->alg_key_len == 0 || x->aalg->alg_key_len > 512) + goto error; + } + if (x->ealg == NULL) + goto error; + + esp = kmalloc(sizeof(*esp), GFP_KERNEL); + if (esp == NULL) + return -ENOMEM; + + memset(esp, 0, sizeof(*esp)); + + if (x->aalg) { + struct xfrm_algo_desc *aalg_desc; + + esp->auth.key = x->aalg->alg_key; + esp->auth.key_len = (x->aalg->alg_key_len+7)/8; + esp->auth.tfm = crypto_alloc_tfm(x->aalg->alg_name, 0); + if (esp->auth.tfm == NULL) + goto error; + esp->auth.icv = esp_hmac_digest; + + aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name); + BUG_ON(!aalg_desc); + + if (aalg_desc->uinfo.auth.icv_fullbits/8 != + crypto_tfm_alg_digestsize(esp->auth.tfm)) { + printk(KERN_INFO "ESP: %s digestsize %u != %hu\n", + x->aalg->alg_name, + crypto_tfm_alg_digestsize(esp->auth.tfm), + aalg_desc->uinfo.auth.icv_fullbits/8); + goto error; + } + + esp->auth.icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8; + esp->auth.icv_trunc_len = aalg_desc->uinfo.auth.icv_truncbits/8; + + esp->auth.work_icv = kmalloc(esp->auth.icv_full_len, GFP_KERNEL); + if (!esp->auth.work_icv) + goto error; + } + esp->conf.key = x->ealg->alg_key; + esp->conf.key_len = (x->ealg->alg_key_len+7)/8; + if (x->props.ealgo == SADB_EALG_NULL) + esp->conf.tfm = crypto_alloc_tfm(x->ealg->alg_name, CRYPTO_TFM_MODE_ECB); + else + esp->conf.tfm = crypto_alloc_tfm(x->ealg->alg_name, CRYPTO_TFM_MODE_CBC); + if (esp->conf.tfm == NULL) + goto error; + esp->conf.ivlen = crypto_tfm_alg_ivsize(esp->conf.tfm); + esp->conf.padlen = 0; + if (esp->conf.ivlen) { + esp->conf.ivec = kmalloc(esp->conf.ivlen, GFP_KERNEL); + /* Do not hand a failed allocation to get_random_bytes(). */ + if (esp->conf.ivec == NULL) + goto error; + get_random_bytes(esp->conf.ivec, esp->conf.ivlen); + } + crypto_cipher_setkey(esp->conf.tfm, esp->conf.key, esp->conf.key_len); + x->props.header_len = sizeof(struct ipv6_esp_hdr) + esp->conf.ivlen; + if (x->props.mode) + x->props.header_len += sizeof(struct ipv6hdr); + x->data = esp; + return 0; + +error: + if (esp) { + if 
(esp->auth.tfm) + crypto_free_tfm(esp->auth.tfm); + if (esp->auth.work_icv) + kfree(esp->auth.work_icv); + if (esp->conf.tfm) + crypto_free_tfm(esp->conf.tfm); + kfree(esp); + } + return -EINVAL; +} + +static struct xfrm_type esp6_type = +{ + .description = "ESP6", + .owner = THIS_MODULE, + .proto = IPPROTO_ESP, + .init_state = esp6_init_state, + .destructor = esp6_destroy, + .get_max_size = esp6_get_max_size, + .input = esp6_input, + .output = esp6_output +}; + +static struct inet6_protocol esp6_protocol = { + .handler = xfrm6_rcv, + .err_handler = esp6_err, + .flags = INET6_PROTO_NOPOLICY, +}; + +int __init esp6_init(void) +{ + if (xfrm_register_type(&esp6_type, AF_INET6) < 0) { + printk(KERN_INFO "ipv6 esp init: can't add xfrm type\n"); + return -EAGAIN; + } + if (inet6_add_protocol(&esp6_protocol, IPPROTO_ESP) < 0) { + printk(KERN_INFO "ipv6 esp init: can't add protocol\n"); + xfrm_unregister_type(&esp6_type, AF_INET6); + return -EAGAIN; + } + + return 0; +} + +static void __exit esp6_fini(void) +{ + if (inet6_del_protocol(&esp6_protocol, IPPROTO_ESP) < 0) + printk(KERN_INFO "ipv6 esp close: can't remove protocol\n"); + if (xfrm_unregister_type(&esp6_type, AF_INET6) < 0) + printk(KERN_INFO "ipv6 esp close: can't remove xfrm type\n"); +} + +module_init(esp6_init); +module_exit(esp6_fini); + +MODULE_LICENSE("GPL"); Index: net/ipv6/exthdrs.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv6/exthdrs.c,v retrieving revision 1.1.1.17 retrieving revision 1.1.1.17.2.1 diff -u -r1.1.1.17 -r1.1.1.17.2.1 --- a/net/ipv6/exthdrs.c 14 Apr 2004 13:05:41 -0000 1.1.1.17 +++ b/net/ipv6/exthdrs.c 16 Apr 2004 13:16:24 -0000 1.1.1.17.2.1 @@ -18,6 +18,9 @@ /* Changes: * yoshfuji : ensure not to overrun while parsing * tlv options. + * Mitsuru KANDA @USAGI and: Remove ipv6_parse_exthdrs(). + * YOSHIFUJI Hideaki @USAGI Register inbound extention header + * handlers as inet6_protocol{}. 
*/ #include @@ -44,20 +47,6 @@ #include /* - * Parsing inbound headers. - * - * Parsing function "func" returns offset wrt skb->nh of the place, - * where next nexthdr value is stored or NULL, if parsing - * failed. It should also update skb->h tp point at the next header. - */ - -struct hdrtype_proc -{ - int type; - int (*func) (struct sk_buff **, int offset); -}; - -/* * Parsing tlv encoded headers. * * Parsing function "func" returns 1, if parsing succeed @@ -164,9 +153,9 @@ {-1, NULL} }; -static int ipv6_dest_opt(struct sk_buff **skb_ptr, int nhoff) +static int ipv6_destopt_rcv(struct sk_buff **skbp, unsigned int *nhoffp) { - struct sk_buff *skb=*skb_ptr; + struct sk_buff *skb = *skbp; struct inet6_skb_parm *opt = (struct inet6_skb_parm *)skb->cb; if (!pskb_may_pull(skb, (skb->h.raw-skb->data)+8) || @@ -179,29 +168,56 @@ if (ip6_parse_tlv(tlvprocdestopt_lst, skb)) { skb->h.raw += ((skb->h.raw[1]+1)<<3); - return opt->dst1; + *nhoffp = opt->dst1; + return 1; } return -1; } +static struct inet6_protocol destopt_protocol = +{ + .handler = ipv6_destopt_rcv, + .flags = INET6_PROTO_NOPOLICY, +}; + +void __init ipv6_destopt_init(void) +{ + if (inet6_add_protocol(&destopt_protocol, IPPROTO_DSTOPTS) < 0) + printk(KERN_ERR "ipv6_destopt_init: Could not register protocol\n"); +} + /******************************** NONE header. No data in packet. ********************************/ -static int ipv6_nodata(struct sk_buff **skb_ptr, int nhoff) +static int ipv6_nodata_rcv(struct sk_buff **skbp, unsigned int *nhoffp) { - kfree_skb(*skb_ptr); - return -1; + struct sk_buff *skb = *skbp; + + kfree_skb(skb); + return 0; +} + +static struct inet6_protocol nodata_protocol = +{ + .handler = ipv6_nodata_rcv, + .flags = INET6_PROTO_NOPOLICY, +}; + +void __init ipv6_nodata_init(void) +{ + if (inet6_add_protocol(&nodata_protocol, IPPROTO_NONE) < 0) + printk(KERN_ERR "ipv6_nodata_init: Could not register protocol\n"); } /******************************** Routing header. 
********************************/ -static int ipv6_routing_header(struct sk_buff **skb_ptr, int nhoff) +static int ipv6_rthdr_rcv(struct sk_buff **skbp, unsigned int *nhoffp) { - struct sk_buff *skb = *skb_ptr; + struct sk_buff *skb = *skbp; struct inet6_skb_parm *opt = (struct inet6_skb_parm *)skb->cb; struct in6_addr *addr; struct in6_addr daddr; @@ -232,7 +248,8 @@ skb->h.raw += (hdr->hdrlen + 1) << 3; opt->dst0 = opt->dst1; opt->dst1 = 0; - return (&hdr->nexthdr) - skb->nh.raw; + *nhoffp = (&hdr->nexthdr) - skb->nh.raw; + return 1; } if (hdr->type != IPV6_SRCRT_TYPE_0) { @@ -247,7 +264,7 @@ /* * This is the routing header forwarding algorithm from - * RFC 1883, page 17. + * RFC 2460, page 16. */ n = hdr->hdrlen >> 1; @@ -265,7 +282,7 @@ kfree_skb(skb); if (skb2 == NULL) return -1; - *skb_ptr = skb = skb2; + *skbp = skb = skb2; opt = (struct inet6_skb_parm *)skb2->cb; hdr = (struct ipv6_rt_hdr *) skb2->h.raw; } @@ -293,7 +310,7 @@ dst_release(xchg(&skb->dst, NULL)); ip6_route_input(skb); if (skb->dst->error) { - skb->dst->input(skb); + dst_input(skb); return -1; } if (skb->dst->dev->flags&IFF_LOOPBACK) { @@ -307,10 +324,22 @@ goto looped_back; } - skb->dst->input(skb); + dst_input(skb); return -1; } +static struct inet6_protocol rthdr_protocol = +{ + .handler = ipv6_rthdr_rcv, + .flags = INET6_PROTO_NOPOLICY, +}; + +void __init ipv6_rthdr_init(void) +{ + if (inet6_add_protocol(&rthdr_protocol, IPPROTO_ROUTING) < 0) + printk(KERN_ERR "ipv6_rthdr_init: Could not register protocol\n"); +}; + /* This function inverts received rthdr. NOTE: specs allow to make it automatically only if @@ -376,97 +405,6 @@ return opt; } -/******************************** - AUTH header. - ********************************/ - -/* - rfc1826 said, that if a host does not implement AUTH header - it MAY ignore it. We use this hole 8) - - Actually, now we can implement OSPFv6 without kernel IPsec. - Authentication for poors may be done in user space with the same success. 
- - Yes, it means, that we allow application to send/receive - raw authentication header. Apparently, we suppose, that it knows - what it does and calculates authentication data correctly. - Certainly, it is possible only for udp and raw sockets, but not for tcp. - - AUTH header has 4byte granular length, which kills all the idea - behind AUTOMATIC 64bit alignment of IPv6. Now we will lose - cpu ticks, checking that sender did not something stupid - and opt->hdrlen is even. Shit! --ANK (980730) - */ - -static int ipv6_auth_hdr(struct sk_buff **skb_ptr, int nhoff) -{ - struct sk_buff *skb=*skb_ptr; - struct inet6_skb_parm *opt = (struct inet6_skb_parm *)skb->cb; - int len; - - if (!pskb_may_pull(skb, (skb->h.raw-skb->data)+8)) - goto fail; - - /* - * RFC2402 2.2 Payload Length - * The 8-bit field specifies the length of AH in 32-bit words - * (4-byte units), minus "2". - * -- Noriaki Takamiya @USAGI Project - */ - len = (skb->h.raw[1]+2)<<2; - - if (len&7) - goto fail; - - if (!pskb_may_pull(skb, (skb->h.raw-skb->data)+len)) - goto fail; - - opt->auth = skb->h.raw - skb->nh.raw; - skb->h.raw += len; - return opt->auth; - -fail: - kfree_skb(skb); - return -1; -} - -/* This list MUST NOT contain entry for NEXTHDR_HOP. - It is parsed immediately after packet received - and if it occurs somewhere in another place we must - generate error. 
- */ - -struct hdrtype_proc hdrproc_lst[] = { - {NEXTHDR_FRAGMENT, ipv6_reassembly}, - {NEXTHDR_ROUTING, ipv6_routing_header}, - {NEXTHDR_DEST, ipv6_dest_opt}, - {NEXTHDR_NONE, ipv6_nodata}, - {NEXTHDR_AUTH, ipv6_auth_hdr}, - /* - {NEXTHDR_ESP, ipv6_esp_hdr}, - */ - {-1, NULL} -}; - -int ipv6_parse_exthdrs(struct sk_buff **skb_in, int nhoff) -{ - struct hdrtype_proc *hdrt; - u8 nexthdr = (*skb_in)->nh.raw[nhoff]; - -restart: - for (hdrt=hdrproc_lst; hdrt->type >= 0; hdrt++) { - if (hdrt->type == nexthdr) { - if ((nhoff = hdrt->func(skb_in, nhoff)) >= 0) { - nexthdr = (*skb_in)->nh.raw[nhoff]; - goto restart; - } - return -1; - } - } - return nhoff; -} - - /********************************** Hop-by-hop options. **********************************/ @@ -498,7 +436,7 @@ } pkt_len = ntohl(*(u32*)(skb->nh.raw+optoff+2)); - if (pkt_len < 0x10000) { + if (pkt_len <= IPV6_MAXPLEN) { icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff+2); return 0; } Index: net/ipv6/icmp.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv6/icmp.c,v retrieving revision 1.1.1.24 retrieving revision 1.1.1.24.2.1 diff -u -r1.1.1.24 -r1.1.1.24.2.1 --- a/net/ipv6/icmp.c 14 Apr 2004 13:05:41 -0000 1.1.1.24 +++ b/net/ipv6/icmp.c 16 Apr 2004 13:16:24 -0000 1.1.1.24.2.1 @@ -26,6 +26,7 @@ * yoshfuji : ensure to sent parameter problem for * fragments. * YOSHIFUJI Hideaki @USAGI: added sysctl for icmp rate limit. 
+ * Kazunori MIYAZAWA @USAGI: change output process to use ip6_append_data */ #include @@ -74,17 +75,11 @@ #define icmpv6_socket __icmpv6_socket[smp_processor_id()] #define icmpv6_socket_cpu(X) __icmpv6_socket[(X)] -int icmpv6_rcv(struct sk_buff *skb); +static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp); -static struct inet6_protocol icmpv6_protocol = -{ - icmpv6_rcv, /* handler */ - NULL, /* error control */ - NULL, /* next */ - IPPROTO_ICMPV6, /* protocol ID */ - 0, /* copy */ - NULL, /* data */ - "ICMPv6" /* name */ +static struct inet6_protocol icmpv6_protocol = { + .handler = icmpv6_rcv, + .flags = INET6_PROTO_FINAL, }; struct icmpv6_msg { @@ -116,40 +111,6 @@ spin_unlock_bh(&icmpv6_socket->sk->lock.slock); } -/* - * getfrag callback - */ - -static int icmpv6_getfrag(const void *data, struct in6_addr *saddr, - char *buff, unsigned int offset, unsigned int len) -{ - struct icmpv6_msg *msg = (struct icmpv6_msg *) data; - struct icmp6hdr *icmph; - __u32 csum; - - if (offset) { - csum = skb_copy_and_csum_bits(msg->skb, msg->offset + - (offset - sizeof(struct icmp6hdr)), - buff, len, msg->csum); - msg->csum = csum; - return 0; - } - - csum = csum_partial_copy_nocheck((void *) &msg->icmph, buff, - sizeof(struct icmp6hdr), msg->csum); - - csum = skb_copy_and_csum_bits(msg->skb, msg->offset, - buff + sizeof(struct icmp6hdr), - len - sizeof(struct icmp6hdr), csum); - - icmph = (struct icmp6hdr *) buff; - - icmph->icmp6_cksum = csum_ipv6_magic(saddr, msg->daddr, msg->len, - IPPROTO_ICMPV6, csum); - return 0; -} - - /* * Slightly more convenient version of icmpv6_send. 
*/ @@ -252,21 +213,74 @@ return (optval&0xC0) == 0x80; } +int icmpv6_push_pending_frames(struct sock *sk, struct flowi *fl, struct icmp6hdr *thdr, int len) +{ + struct sk_buff *skb; + struct icmp6hdr *icmp6h; + int err = 0; + + if ((skb = skb_peek(&sk->write_queue)) == NULL) + goto out; + + icmp6h = (struct icmp6hdr*) skb->h.raw; + memcpy(icmp6h, thdr, sizeof(struct icmp6hdr)); + icmp6h->icmp6_cksum = 0; + + if (skb_queue_len(&sk->write_queue) == 1) { + skb->csum = csum_partial((char *)icmp6h, + sizeof(struct icmp6hdr), skb->csum); + icmp6h->icmp6_cksum = csum_ipv6_magic(&fl->fl6_src, + &fl->fl6_dst, + len, fl->proto, + skb->csum); + } else { + u32 tmp_csum = 0; + + skb_queue_walk(&sk->write_queue, skb) { + tmp_csum = csum_add(tmp_csum, skb->csum); + } + + tmp_csum = csum_partial((char *)icmp6h, + sizeof(struct icmp6hdr), tmp_csum); + tmp_csum = csum_ipv6_magic(&fl->fl6_src, + &fl->fl6_dst, + len, fl->proto, tmp_csum); + icmp6h->icmp6_cksum = tmp_csum; + } + if (icmp6h->icmp6_cksum == 0) + icmp6h->icmp6_cksum = -1; + ip6_push_pending_frames(sk); +out: + return err; +} + +static int icmpv6_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) +{ + struct sk_buff *org_skb = (struct sk_buff *)from; + __u32 csum = 0; + csum = skb_copy_and_csum_bits(org_skb, offset, to, len, csum); + skb->csum = csum_block_add(skb->csum, csum, odd); + return 0; +} + /* * Send an ICMP message in response to a packet in error */ - void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, struct net_device *dev) { struct ipv6hdr *hdr = skb->nh.ipv6h; struct sock *sk = icmpv6_socket->sk; + struct ipv6_pinfo *np = inet6_sk(sk); struct in6_addr *saddr = NULL; - int iif = 0; - struct icmpv6_msg msg; + struct dst_entry *dst; + struct icmp6hdr tmp_hdr; struct flowi fl; + int iif = 0; int addr_type = 0; - int len; + int len, plen; + int hlimit = -1; + int err = 0; if ((u8*)hdr < skb->head || (u8*)(hdr+1) > skb->tail) return; @@ -324,13 +338,14 @@ return; } + 
memset(&fl, 0, sizeof(fl)); fl.proto = IPPROTO_ICMPV6; - fl.nl_u.ip6_u.daddr = &hdr->saddr; - fl.nl_u.ip6_u.saddr = saddr; + ipv6_addr_copy(&fl.fl6_dst, &hdr->saddr); + if (saddr) + ipv6_addr_copy(&fl.fl6_src, saddr); fl.oif = iif; - fl.fl6_flowlabel = 0; - fl.uli_u.icmpt.type = type; - fl.uli_u.icmpt.code = code; + fl.fl_icmp_type = type; + fl.fl_icmp_code = code; if (icmpv6_xmit_lock()) return; @@ -338,37 +353,53 @@ if (!icmpv6_xrlim_allow(sk, type, &fl)) goto out; - /* - * ok. kick it. checksum will be provided by the - * getfrag_t callback. - */ + tmp_hdr.icmp6_type = type; + tmp_hdr.icmp6_code = code; + tmp_hdr.icmp6_cksum = 0; + tmp_hdr.icmp6_pointer = htonl(info); + + if (!fl.oif && ipv6_addr_is_multicast(&fl.fl6_dst)) + fl.oif = np->mcast_oif; - msg.icmph.icmp6_type = type; - msg.icmph.icmp6_code = code; - msg.icmph.icmp6_cksum = 0; - msg.icmph.icmp6_pointer = htonl(info); - - msg.skb = skb; - msg.offset = skb->nh.raw - skb->data; - msg.csum = 0; - msg.daddr = &hdr->saddr; + err = ip6_dst_lookup(sk, &dst, &fl); + if (err) + goto out; - len = skb->len - msg.offset + sizeof(struct icmp6hdr); - len = min_t(unsigned int, len, IPV6_MIN_MTU - sizeof(struct ipv6hdr)); + if (hlimit < 0) { + if (ipv6_addr_is_multicast(&fl.fl6_dst)) + hlimit = np->mcast_hops; + else + hlimit = np->hop_limit; + if (hlimit < 0) + hlimit = dst_metric(dst, RTAX_HOPLIMIT); + } + plen = skb->nh.raw - skb->data; + __skb_pull(skb, plen); + len = skb->len; + len = min_t(unsigned int, len, IPV6_MIN_MTU - sizeof(struct ipv6hdr) -sizeof(struct icmp6hdr)); if (len < 0) { if (net_ratelimit()) printk(KERN_DEBUG "icmp: len problem\n"); - goto out; + __skb_push(skb, plen); + goto out_dst_release; } - msg.len = len; + err = ip6_append_data(sk, icmpv6_getfrag, skb, len + sizeof(struct icmp6hdr), sizeof(struct icmp6hdr), + hlimit, NULL, &fl, (struct rt6_info*)dst, MSG_DONTWAIT); + if (err) { + ip6_flush_pending_frames(sk); + __skb_push(skb, plen); /* undo the __skb_pull() above so the caller's skb is unchanged on failure */ + goto out_dst_release; + } + err = icmpv6_push_pending_frames(sk, &fl, &tmp_hdr, 
len + sizeof(struct icmp6hdr)); + __skb_push(skb, plen); - ip6_build_xmit(sk, icmpv6_getfrag, &msg, &fl, len, NULL, -1, - MSG_DONTWAIT); if (type >= ICMPV6_DEST_UNREACH && type <= ICMPV6_PARAMPROB) (&(icmpv6_statistics[smp_processor_id()*2].Icmp6OutDestUnreachs))[type-1]++; ICMP6_INC_STATS_BH(Icmp6OutMsgs); +out_dst_release: + dst_release(dst); out: icmpv6_xmit_unlock(); } @@ -376,45 +406,66 @@ static void icmpv6_echo_reply(struct sk_buff *skb) { struct sock *sk = icmpv6_socket->sk; + struct ipv6_pinfo *np = inet6_sk(sk); + struct in6_addr *saddr = NULL; struct icmp6hdr *icmph = (struct icmp6hdr *) skb->h.raw; - struct in6_addr *saddr; - struct icmpv6_msg msg; + struct icmp6hdr tmp_hdr; struct flowi fl; + struct dst_entry *dst; + int err = 0; + int hlimit = -1; saddr = &skb->nh.ipv6h->daddr; - if (ipv6_addr_type(saddr) & IPV6_ADDR_MULTICAST || - ipv6_chk_acast_addr(0, saddr)) + if (!ipv6_unicast_destination(skb)) saddr = NULL; - msg.icmph.icmp6_type = ICMPV6_ECHO_REPLY; - msg.icmph.icmp6_code = 0; - msg.icmph.icmp6_cksum = 0; - msg.icmph.icmp6_identifier = icmph->icmp6_identifier; - msg.icmph.icmp6_sequence = icmph->icmp6_sequence; - - msg.skb = skb; - msg.offset = 0; - msg.csum = 0; - msg.len = skb->len + sizeof(struct icmp6hdr); - msg.daddr = &skb->nh.ipv6h->saddr; + memcpy(&tmp_hdr, icmph, sizeof(tmp_hdr)); + tmp_hdr.icmp6_type = ICMPV6_ECHO_REPLY; + memset(&fl, 0, sizeof(fl)); fl.proto = IPPROTO_ICMPV6; - fl.nl_u.ip6_u.daddr = msg.daddr; - fl.nl_u.ip6_u.saddr = saddr; + ipv6_addr_copy(&fl.fl6_dst, &skb->nh.ipv6h->saddr); + if (saddr) + ipv6_addr_copy(&fl.fl6_src, saddr); fl.oif = skb->dev->ifindex; - fl.fl6_flowlabel = 0; - fl.uli_u.icmpt.type = ICMPV6_ECHO_REPLY; - fl.uli_u.icmpt.code = 0; + fl.fl_icmp_type = ICMPV6_ECHO_REPLY; if (icmpv6_xmit_lock()) return; - ip6_build_xmit(sk, icmpv6_getfrag, &msg, &fl, msg.len, NULL, -1, - MSG_DONTWAIT); - ICMP6_INC_STATS_BH(Icmp6OutEchoReplies); - ICMP6_INC_STATS_BH(Icmp6OutMsgs); + if (!fl.oif && 
ipv6_addr_is_multicast(&fl.fl6_dst)) + fl.oif = np->mcast_oif; + + err = ip6_dst_lookup(sk, &dst, &fl); + if (err) + goto out; + if (hlimit < 0) { + if (ipv6_addr_is_multicast(&fl.fl6_dst)) + hlimit = np->mcast_hops; + else + hlimit = np->hop_limit; + if (hlimit < 0) + hlimit = dst_metric(dst, RTAX_HOPLIMIT); + } + + err = ip6_append_data(sk, icmpv6_getfrag, skb, skb->len + sizeof(struct icmp6hdr), + sizeof(struct icmp6hdr), hlimit, NULL, &fl, + (struct rt6_info*)dst, MSG_DONTWAIT); + + if (err) { + ip6_flush_pending_frames(sk); + goto out_dst_release; + } + err = icmpv6_push_pending_frames(sk, &fl, &tmp_hdr, skb->len + sizeof(struct icmp6hdr)); + + ICMP6_INC_STATS_BH(Icmp6OutEchoReplies); + ICMP6_INC_STATS_BH(Icmp6OutMsgs); + +out_dst_release: + dst_release(dst); +out: icmpv6_xmit_unlock(); } @@ -456,15 +507,9 @@ hash = nexthdr & (MAX_INET_PROTOS - 1); - for (ipprot = (struct inet6_protocol *) inet6_protos[hash]; - ipprot != NULL; - ipprot=(struct inet6_protocol *)ipprot->next) { - if (ipprot->protocol != nexthdr) - continue; - - if (ipprot->err_handler) - ipprot->err_handler(skb, NULL, type, code, inner_offset, info); - } + ipprot = inet6_protos[hash]; + if (ipprot && ipprot->err_handler) + ipprot->err_handler(skb, NULL, type, code, inner_offset, info); read_lock(&raw_v6_lock); if ((sk = raw_v6_htable[hash]) != NULL) { @@ -480,8 +525,9 @@ * Handle icmp messages */ -int icmpv6_rcv(struct sk_buff *skb) +static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) { + struct sk_buff *skb = *pskb; struct net_device *dev = skb->dev; struct in6_addr *saddr, *daddr; struct ipv6hdr *orig_hdr; @@ -658,7 +704,12 @@ sk->prot->unhash(sk); } - inet6_add_protocol(&icmpv6_protocol); + if (inet6_add_protocol(&icmpv6_protocol, IPPROTO_ICMPV6) < 0) { + printk(KERN_ERR "Failed to register ICMP6 protocol\n"); + sock_release(icmpv6_socket); + icmpv6_socket = NULL; + return -EAGAIN; + } return 0; fail: @@ -677,7 +728,7 @@ sock_release(icmpv6_socket_cpu(i)); icmpv6_socket_cpu(i) 
= NULL; } - inet6_del_protocol(&icmpv6_protocol); + inet6_del_protocol(&icmpv6_protocol, IPPROTO_ICMPV6); } static struct icmp6_err { Index: net/ipv6/ip6_fib.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv6/ip6_fib.c,v retrieving revision 1.1.1.21 retrieving revision 1.1.1.21.2.1 diff -u -r1.1.1.21 -r1.1.1.21.2.1 --- a/net/ipv6/ip6_fib.c 25 Aug 2003 11:44:44 -0000 1.1.1.21 +++ b/net/ipv6/ip6_fib.c 16 Apr 2004 13:16:24 -0000 1.1.1.21.2.1 @@ -40,7 +40,6 @@ #include #define RT6_DEBUG 2 -#undef CONFIG_IPV6_SUBTREES #if RT6_DEBUG >= 3 #define RT6_TRACE(x...) printk(KERN_DEBUG x) @@ -453,7 +452,6 @@ */ if ((iter->rt6i_dev == rt->rt6i_dev) && - (iter->rt6i_flowr == rt->rt6i_flowr) && (ipv6_addr_cmp(&iter->rt6i_gateway, &rt->rt6i_gateway) == 0)) { if (!(iter->rt6i_flags&RTF_EXPIRES)) @@ -500,13 +498,19 @@ mod_timer(&ip6_fib_timer, jiffies + ip6_rt_gc_interval); } +void fib6_force_start_gc(void) +{ + if (ip6_fib_timer.expires == 0) + mod_timer(&ip6_fib_timer, jiffies + ip6_rt_gc_interval); +} + /* * Add routing information to the routing tree. * / * with source addr info in sub-trees */ -int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nlmsghdr *nlh) +int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr) { struct fib6_node *fn; int err = -ENOMEM; @@ -597,8 +601,8 @@ is orphan. If it is, shoot it. 
*/ st_failure: - if (fn && !(fn->fn_flags&RTN_RTINFO|RTN_ROOT)) - fib_repair_tree(fn); + if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT))) + fib6_repair_tree(fn); dst_free(&rt->u.dst); return err; #endif @@ -888,7 +892,7 @@ } static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, - struct nlmsghdr *nlh) + struct nlmsghdr *nlh, void *_rtattr) { struct fib6_walker_t *w; struct rt6_info *rt = *rtp; @@ -947,7 +951,7 @@ rt6_release(rt); } -int fib6_del(struct rt6_info *rt, struct nlmsghdr *nlh) +int fib6_del(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr) { struct fib6_node *fn = rt->rt6i_node; struct rt6_info **rtp; @@ -972,7 +976,7 @@ for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->u.next) { if (*rtp == rt) { - fib6_del_route(fn, rtp, nlh); + fib6_del_route(fn, rtp, nlh, _rtattr); return 0; } } @@ -1101,7 +1105,7 @@ res = c->func(rt, c->arg); if (res < 0) { w->leaf = rt; - res = fib6_del(rt, NULL); + res = fib6_del(rt, NULL, NULL); if (res) { #if RT6_DEBUG >= 2 printk(KERN_DEBUG "fib6_clean_node: del failed: rt=%p@%p err=%d\n", rt, rt->rt6i_node, res); @@ -1218,6 +1222,7 @@ write_lock_bh(&rt6_lock); + ndisc_dst_gc(&gc_args.more); fib6_clean_tree(&ip6_routing_table, fib6_age, 0, NULL); write_unlock_bh(&rt6_lock); @@ -1232,17 +1237,17 @@ void __init fib6_init(void) { - if (!fib6_node_kmem) - fib6_node_kmem = kmem_cache_create("fib6_nodes", - sizeof(struct fib6_node), - 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); + fib6_node_kmem = kmem_cache_create("fib6_nodes", + sizeof(struct fib6_node), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); } #ifdef MODULE void fib6_gc_cleanup(void) { del_timer(&ip6_fib_timer); + kmem_cache_destroy(fib6_node_kmem); } #endif Index: net/ipv6/ip6_fw.c =================================================================== RCS file: net/ipv6/ip6_fw.c diff -N net/ipv6/ip6_fw.c --- a/net/ipv6/ip6_fw.c 21 Dec 2001 17:42:05 -0000 1.1.1.4 +++ /dev/null 1 Jan 1970 00:00:00 -0000 @@ -1,390 +0,0 @@ -/* - * IPv6 Firewall - * Linux INET6 
implementation - * - * Authors: - * Pedro Roque - * - * $Id: ip6_fw.c,v 1.16 2001/10/31 08:17:58 davem Exp $ - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -static unsigned long ip6_fw_rule_cnt; -static struct ip6_fw_rule ip6_fw_rule_list = { - {0}, - NULL, NULL, - {0}, - IP6_FW_REJECT -}; - -static int ip6_fw_accept(struct dst_entry *dst, struct fl_acc_args *args); - -struct flow_rule_ops ip6_fw_ops = { - ip6_fw_accept -}; - - -static struct rt6_info ip6_fw_null_entry = { - {{NULL, 0, 0, NULL, - 0, 0, 0, 0, 0, 0, 0, 0, -ENETUNREACH, NULL, NULL, - ip6_pkt_discard, ip6_pkt_discard, NULL}}, - NULL, {{{0}}}, 256, RTF_REJECT|RTF_NONEXTHOP, ~0UL, - 0, &ip6_fw_rule_list, {{{{0}}}, 128}, {{{{0}}}, 128} -}; - -static struct fib6_node ip6_fw_fib = { - NULL, NULL, NULL, NULL, - &ip6_fw_null_entry, - 0, RTN_ROOT|RTN_TL_ROOT, 0 -}; - -rwlock_t ip6_fw_lock = RW_LOCK_UNLOCKED; - - -static void ip6_rule_add(struct ip6_fw_rule *rl) -{ - struct ip6_fw_rule *next; - - write_lock_bh(&ip6_fw_lock); - ip6_fw_rule_cnt++; - next = &ip6_fw_rule_list; - rl->next = next; - rl->prev = next->prev; - rl->prev->next = rl; - next->prev = rl; - write_unlock_bh(&ip6_fw_lock); -} - -static void ip6_rule_del(struct ip6_fw_rule *rl) -{ - struct ip6_fw_rule *next, *prev; - - write_lock_bh(&ip6_fw_lock); - ip6_fw_rule_cnt--; - next = rl->next; - prev = rl->prev; - next->prev = prev; - prev->next = next; - write_unlock_bh(&ip6_fw_lock); -} - -static __inline__ struct ip6_fw_rule * ip6_fwrule_alloc(void) -{ - struct ip6_fw_rule *rl; - - rl = kmalloc(sizeof(struct ip6_fw_rule), GFP_ATOMIC); - if (rl) - { - memset(rl, 0, 
sizeof(struct ip6_fw_rule)); - rl->flowr.ops = &ip6_fw_ops; - } - return rl; -} - -static __inline__ void ip6_fwrule_free(struct ip6_fw_rule * rl) -{ - kfree(rl); -} - -static __inline__ int port_match(int rl_port, int fl_port) -{ - int res = 0; - if (rl_port == 0 || (rl_port == fl_port)) - res = 1; - return res; -} - -static int ip6_fw_accept_trans(struct ip6_fw_rule *rl, - struct fl_acc_args *args) -{ - int res = FLOWR_NODECISION; - int proto = 0; - int sport = 0; - int dport = 0; - - switch (args->type) { - case FL_ARG_FORWARD: - { - struct sk_buff *skb = args->fl_u.skb; - struct ipv6hdr *hdr = skb->nh.ipv6h; - int len; - - len = skb->len - sizeof(struct ipv6hdr); - - proto = hdr->nexthdr; - - switch (proto) { - case IPPROTO_TCP: - { - struct tcphdr *th; - - if (len < sizeof(struct tcphdr)) { - res = FLOWR_ERROR; - goto out; - } - th = (struct tcphdr *)(hdr + 1); - sport = th->source; - dport = th->dest; - break; - } - case IPPROTO_UDP: - { - struct udphdr *uh; - - if (len < sizeof(struct udphdr)) { - res = FLOWR_ERROR; - goto out; - } - uh = (struct udphdr *)(hdr + 1); - sport = uh->source; - dport = uh->dest; - break; - } - default: - goto out; - }; - break; - } - - case FL_ARG_ORIGIN: - { - proto = args->fl_u.fl_o.flow->proto; - - if (proto == IPPROTO_ICMPV6) { - goto out; - } else { - sport = args->fl_u.fl_o.flow->uli_u.ports.sport; - dport = args->fl_u.fl_o.flow->uli_u.ports.dport; - } - break; - } - - if (proto == rl->info.proto && - port_match(args->fl_u.fl_o.flow->uli_u.ports.sport, sport) && - port_match(args->fl_u.fl_o.flow->uli_u.ports.dport, dport)) { - if (rl->policy & IP6_FW_REJECT) - res = FLOWR_SELECT; - else - res = FLOWR_CLEAR; - } - - default: -#if IP6_FW_DEBUG >= 1 - printk(KERN_DEBUG "ip6_fw_accept: unknown arg type\n"); -#endif - goto out; - }; - -out: - return res; -} - -static int ip6_fw_accept(struct dst_entry *dst, struct fl_acc_args *args) -{ - struct rt6_info *rt; - struct ip6_fw_rule *rl; - int proto; - int res = FLOWR_NODECISION; - 
- rt = (struct rt6_info *) dst; - rl = (struct ip6_fw_rule *) rt->rt6i_flowr; - - proto = rl->info.proto; - - switch (proto) { - case 0: - if (rl->policy & IP6_FW_REJECT) - res = FLOWR_SELECT; - else - res = FLOWR_CLEAR; - break; - case IPPROTO_TCP: - case IPPROTO_UDP: - res = ip6_fw_accept_trans(rl, args); - break; - case IPPROTO_ICMPV6: - }; - - return res; -} - -static struct dst_entry * ip6_fw_dup(struct dst_entry *frule, - struct dst_entry *rt, - struct fl_acc_args *args) -{ - struct ip6_fw_rule *rl; - struct rt6_info *nrt; - struct rt6_info *frt; - - frt = (struct rt6_info *) frule; - - rl = (struct ip6_fw_rule *) frt->rt6i_flowr; - - nrt = ip6_rt_copy((struct rt6_info *) rt); - - if (nrt) { - nrt->u.dst.input = frule->input; - nrt->u.dst.output = frule->output; - - nrt->rt6i_flowr = flow_clone(frt->rt6i_flowr); - - nrt->rt6i_flags |= RTF_CACHE; - nrt->rt6i_tstamp = jiffies; - } - - return (struct dst_entry *) nrt; -} - -int ip6_fw_reject(struct sk_buff *skb) -{ -#if IP6_FW_DEBUG >= 1 - printk(KERN_DEBUG "packet rejected: \n"); -#endif - - icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADM_PROHIBITED, 0, - skb->dev); - /* - * send it via netlink, as (rule, skb) - */ - - kfree_skb(skb); - return 0; -} - -int ip6_fw_discard(struct sk_buff *skb) -{ - printk(KERN_DEBUG "ip6_fw: BUG fw_reject called\n"); - kfree_skb(skb); - return 0; -} - -int ip6_fw_msg_add(struct ip6_fw_msg *msg) -{ - struct in6_rtmsg rtmsg; - struct ip6_fw_rule *rl; - struct rt6_info *rt; - int err; - - ipv6_addr_copy(&rtmsg.rtmsg_dst, &msg->dst); - ipv6_addr_copy(&rtmsg.rtmsg_src, &msg->src); - rtmsg.rtmsg_dst_len = msg->dst_len; - rtmsg.rtmsg_src_len = msg->src_len; - rtmsg.rtmsg_metric = IP6_RT_PRIO_FW; - - rl = ip6_fwrule_alloc(); - - if (rl == NULL) - return -ENOMEM; - - rl->policy = msg->policy; - rl->info.proto = msg->proto; - rl->info.uli_u.data = msg->u.data; - - rtmsg.rtmsg_flags = RTF_NONEXTHOP|RTF_POLICY; - err = ip6_route_add(&rtmsg); - - if (err) { - ip6_fwrule_free(rl); - return 
err; - } - - /* The rest will not work for now. --ABK (989725) */ - -#ifndef notdef - ip6_fwrule_free(rl); - return -EPERM; -#else - rt->u.dst.error = -EPERM; - - if (msg->policy == IP6_FW_ACCEPT) { - /* - * Accept rules are never selected - * (i.e. packets use normal forwarding) - */ - rt->u.dst.input = ip6_fw_discard; - rt->u.dst.output = ip6_fw_discard; - } else { - rt->u.dst.input = ip6_fw_reject; - rt->u.dst.output = ip6_fw_reject; - } - - ip6_rule_add(rl); - - rt->rt6i_flowr = flow_clone((struct flow_rule *)rl); - - return 0; -#endif -} - -static int ip6_fw_msgrcv(int unit, struct sk_buff *skb) -{ - int count = 0; - - while (skb->len) { - struct ip6_fw_msg *msg; - - if (skb->len < sizeof(struct ip6_fw_msg)) { - count = -EINVAL; - break; - } - - msg = (struct ip6_fw_msg *) skb->data; - skb_pull(skb, sizeof(struct ip6_fw_msg)); - count += sizeof(struct ip6_fw_msg); - - switch (msg->action) { - case IP6_FW_MSG_ADD: - ip6_fw_msg_add(msg); - break; - case IP6_FW_MSG_DEL: - break; - default: - return -EINVAL; - }; - } - - return count; -} - -static void ip6_fw_destroy(struct flow_rule *rl) -{ - ip6_fwrule_free((struct ip6_fw_rule *)rl); -} - -#ifdef MODULE -#define ip6_fw_init module_init -#endif - -void __init ip6_fw_init(void) -{ - netlink_attach(NETLINK_IP6_FW, ip6_fw_msgrcv); -} - -#ifdef MODULE -void cleanup_module(void) -{ - netlink_detach(NETLINK_IP6_FW); -} -#endif Index: net/ipv6/ip6_input.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv6/ip6_input.c,v retrieving revision 1.1.1.15 retrieving revision 1.1.1.15.2.1 diff -u -r1.1.1.15 -r1.1.1.15.2.1 --- a/net/ipv6/ip6_input.c 25 Aug 2003 11:44:44 -0000 1.1.1.15 +++ b/net/ipv6/ip6_input.c 16 Apr 2004 13:16:25 -0000 1.1.1.15.2.1 @@ -15,6 +15,11 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. 
*/ +/* Changes + * + * Mitsuru KANDA @USAGI and + * YOSHIFUJI Hideaki @USAGI: Remove ipv6_parse_exthdrs(). + */ #include #include @@ -39,6 +44,7 @@ #include #include #include +#include @@ -47,7 +53,7 @@ if (skb->dst == NULL) ip6_route_input(skb); - return skb->dst->input(skb); + return dst_input(skb); } int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) @@ -121,13 +127,12 @@ static inline int ip6_input_finish(struct sk_buff *skb) { - struct ipv6hdr *hdr = skb->nh.ipv6h; struct inet6_protocol *ipprot; struct sock *raw_sk; - int nhoff; + unsigned int nhoff; int nexthdr; - int found = 0; u8 hash; + int cksum_sub = 0; skb->h.raw = skb->nh.raw + sizeof(struct ipv6hdr); @@ -135,7 +140,7 @@ * Parse extension headers */ - nexthdr = hdr->nexthdr; + nexthdr = skb->nh.ipv6h->nexthdr; nhoff = offsetof(struct ipv6hdr, nexthdr); /* Skip hop-by-hop options, they are already parsed. */ @@ -145,58 +150,46 @@ skb->h.raw += (skb->h.raw[1]+1)<<3; } - /* This check is sort of optimization. 
- It would be stupid to detect for optional headers, - which are missing with probability of 200% - */ - if (nexthdr != IPPROTO_TCP && nexthdr != IPPROTO_UDP) { - nhoff = ipv6_parse_exthdrs(&skb, nhoff); - if (nhoff < 0) - return 0; - nexthdr = skb->nh.raw[nhoff]; - hdr = skb->nh.ipv6h; - } - +resubmit: if (!pskb_pull(skb, skb->h.raw - skb->data)) goto discard; + nexthdr = skb->nh.raw[nhoff]; - if (skb->ip_summed == CHECKSUM_HW) - skb->csum = csum_sub(skb->csum, - csum_partial(skb->nh.raw, skb->h.raw-skb->nh.raw, 0)); - - raw_sk = raw_v6_htable[nexthdr&(MAX_INET_PROTOS-1)]; + raw_sk = raw_v6_htable[nexthdr & (MAX_INET_PROTOS - 1)]; if (raw_sk) - raw_sk = ipv6_raw_deliver(skb, nexthdr); + ipv6_raw_deliver(skb, nexthdr); hash = nexthdr & (MAX_INET_PROTOS - 1); - for (ipprot = (struct inet6_protocol *) inet6_protos[hash]; - ipprot != NULL; - ipprot = (struct inet6_protocol *) ipprot->next) { - struct sk_buff *buff = skb; - - if (ipprot->protocol != nexthdr) - continue; - - if (ipprot->copy || raw_sk) - buff = skb_clone(skb, GFP_ATOMIC); - - if (buff) - ipprot->handler(buff); - found = 1; - } - - if (raw_sk) { - rawv6_rcv(raw_sk, skb); - sock_put(raw_sk); - found = 1; - } - - /* - * not found: send ICMP parameter problem back - */ - if (!found) { - IP6_INC_STATS_BH(Ip6InUnknownProtos); - icmpv6_param_prob(skb, ICMPV6_UNK_NEXTHDR, nhoff); + if ((ipprot = inet6_protos[hash]) != NULL) { + int ret; + + if (ipprot->flags & INET6_PROTO_FINAL) { + if (!cksum_sub && skb->ip_summed == CHECKSUM_HW) { + skb->csum = csum_sub(skb->csum, + csum_partial(skb->nh.raw, skb->h.raw-skb->nh.raw, 0)); + cksum_sub++; + } + } + if (!(ipprot->flags & INET6_PROTO_NOPOLICY) && + !xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { + kfree_skb(skb); + return 0; + } + + ret = ipprot->handler(&skb, &nhoff); + if (ret > 0) + goto resubmit; + else if (ret == 0) + IP6_INC_STATS_BH(Ip6InDelivers); + } else { + if (!raw_sk) { + if (xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { + 
IP6_INC_STATS_BH(Ip6InUnknownProtos); + icmpv6_param_prob(skb, ICMPV6_UNK_NEXTHDR, nhoff); + } + } else { + kfree_skb(skb); + } } return 0; @@ -246,7 +239,7 @@ skb2 = skb; } - dst->output(skb2); + dst_output(skb2); } } #endif Index: net/ipv6/ip6_output.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv6/ip6_output.c,v retrieving revision 1.1.1.21 retrieving revision 1.1.1.21.2.1 diff -u -r1.1.1.21 -r1.1.1.21.2.1 --- a/net/ipv6/ip6_output.c 25 Aug 2003 11:44:44 -0000 1.1.1.21 +++ b/net/ipv6/ip6_output.c 16 Apr 2004 13:16:25 -0000 1.1.1.21.2.1 @@ -23,6 +23,9 @@ * * H. von Brand : Added missing #include * Imran Patel : frag id should be in NBO + * Kazunori MIYAZAWA @USAGI + * : add ip6_append_data and related functions + * for datagram xmit */ #include @@ -49,6 +52,9 @@ #include #include #include +#include + +static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)); static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr) { @@ -99,7 +105,7 @@ } -int ip6_output(struct sk_buff *skb) +int ip6_output2(struct sk_buff *skb) { struct dst_entry *dst = skb->dst; struct net_device *dev = dst->dev; @@ -134,21 +140,27 @@ return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb,NULL, skb->dev,ip6_output_finish); } +int ip6_output(struct sk_buff *skb) +{ + if ((skb->len > dst_pmtu(skb->dst) || skb_shinfo(skb)->frag_list)) + return ip6_fragment(skb, ip6_output2); + else + return ip6_output2(skb); +} #ifdef CONFIG_NETFILTER int ip6_route_me_harder(struct sk_buff *skb) { struct ipv6hdr *iph = skb->nh.ipv6h; struct dst_entry *dst; - struct flowi fl; - - fl.proto = iph->nexthdr; - fl.fl6_dst = &iph->daddr; - fl.fl6_src = &iph->saddr; - fl.oif = skb->sk ? skb->sk->bound_dev_if : 0; - fl.fl6_flowlabel = 0; - fl.uli_u.ports.dport = 0; - fl.uli_u.ports.sport = 0; + struct flowi fl = { + .oif = skb->sk ? 
skb->sk->bound_dev_if : 0, + .nl_u = + { .ip6_u = + { .daddr = iph->daddr, + .saddr = iph->saddr, } }, + .proto = iph->nexthdr, + }; dst = ip6_route_output(skb->sk, &fl); @@ -177,7 +189,7 @@ } } #endif /* CONFIG_NETFILTER */ - return skb->dst->output(skb); + return dst_output(skb); } /* @@ -188,12 +200,13 @@ struct ipv6_txoptions *opt) { struct ipv6_pinfo * np = sk ? &sk->net_pinfo.af_inet6 : NULL; - struct in6_addr *first_hop = fl->nl_u.ip6_u.daddr; + struct in6_addr *first_hop = &fl->fl6_dst; struct dst_entry *dst = skb->dst; struct ipv6hdr *hdr; u8 proto = fl->proto; int seg_len = skb->len; int hlimit; + u32 mtu; if (opt) { int head_room; @@ -231,16 +244,17 @@ if (np) hlimit = np->hop_limit; if (hlimit < 0) - hlimit = ((struct rt6_info*)dst)->rt6i_hoplimit; + hlimit = dst_metric(dst, RTAX_HOPLIMIT); hdr->payload_len = htons(seg_len); hdr->nexthdr = proto; hdr->hop_limit = hlimit; - ipv6_addr_copy(&hdr->saddr, fl->nl_u.ip6_u.saddr); + ipv6_addr_copy(&hdr->saddr, &fl->fl6_src); ipv6_addr_copy(&hdr->daddr, first_hop); - if (skb->len <= dst->pmtu) { + mtu = dst_pmtu(dst); + if (skb->len <= mtu) { IP6_INC_STATS(Ip6OutRequests); return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, ip6_maybe_reroute); } @@ -248,7 +262,7 @@ if (net_ratelimit()) printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n"); skb->dev = dst->dev; - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst->pmtu, skb->dev); + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); kfree_skb(skb); return -EMSGSIZE; } @@ -302,8 +316,8 @@ hdr->hop_limit = hlimit; hdr->nexthdr = fl->proto; - ipv6_addr_copy(&hdr->saddr, fl->nl_u.ip6_u.saddr); - ipv6_addr_copy(&hdr->daddr, fl->nl_u.ip6_u.daddr); + ipv6_addr_copy(&hdr->saddr, &fl->fl6_src); + ipv6_addr_copy(&hdr->daddr, &fl->fl6_dst); return hdr; } @@ -507,19 +521,19 @@ struct ipv6_txoptions *opt, int hlimit, int flags) { struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; - struct in6_addr *final_dst = NULL; + struct in6_addr final_dst_buf, *final_dst = NULL; 
struct dst_entry *dst; int err = 0; unsigned int pktlength, jumbolen, mtu; - struct in6_addr saddr; if (opt && opt->srcrt) { struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt; - final_dst = fl->fl6_dst; - fl->fl6_dst = rt0->addr; + ipv6_addr_copy(&final_dst_buf, &fl->fl6_dst); + final_dst = &final_dst_buf; + ipv6_addr_copy(&fl->fl6_dst, rt0->addr); } - if (!fl->oif && ipv6_addr_is_multicast(fl->nl_u.ip6_u.daddr)) + if (!fl->oif && ipv6_addr_is_multicast(&fl->fl6_dst)) fl->oif = np->mcast_oif; dst = __sk_dst_check(sk, np->dst_cookie); @@ -545,9 +559,9 @@ */ if (((rt->rt6i_dst.plen != 128 || - ipv6_addr_cmp(fl->fl6_dst, &rt->rt6i_dst.addr)) + ipv6_addr_cmp(&fl->fl6_dst, &rt->rt6i_dst.addr)) && (np->daddr_cache == NULL || - ipv6_addr_cmp(fl->fl6_dst, np->daddr_cache))) + ipv6_addr_cmp(&fl->fl6_dst, np->daddr_cache))) || (fl->oif && fl->oif != dst->dev->ifindex)) { dst = NULL; } else @@ -563,8 +577,8 @@ return -ENETUNREACH; } - if (fl->fl6_src == NULL) { - err = ipv6_get_saddr(dst, fl->fl6_dst, &saddr); + if (ipv6_addr_any(&fl->fl6_src)) { + err = ipv6_get_saddr(dst, &fl->fl6_dst, &fl->fl6_src); if (err) { #if IP6_DEBUG >= 2 @@ -573,17 +587,23 @@ #endif goto out; } - fl->fl6_src = &saddr; } pktlength = length; + if (dst) { + if ((err = xfrm_lookup(&dst, fl, sk, 0)) < 0) { + dst_release(dst); + return -ENETUNREACH; + } + } + if (hlimit < 0) { - if (ipv6_addr_is_multicast(fl->fl6_dst)) + if (ipv6_addr_is_multicast(&fl->fl6_dst)) hlimit = np->mcast_hops; else hlimit = np->hop_limit; if (hlimit < 0) - hlimit = ((struct rt6_info*)dst)->rt6i_hoplimit; + hlimit = dst_metric(dst, RTAX_HOPLIMIT); } jumbolen = 0; @@ -593,7 +613,7 @@ if (opt) pktlength += opt->opt_flen + opt->opt_nflen; - if (pktlength > 0xFFFF + sizeof(struct ipv6hdr)) { + if (pktlength > sizeof(struct ipv6hdr) + IPV6_MAXPLEN) { /* Jumbo datagram. It is assumed, that in the case of hdrincl jumbo option is supplied by user. 
@@ -603,7 +623,7 @@ } } - mtu = dst->pmtu; + mtu = dst_pmtu(dst); if (np->frag_size < mtu) { if (np->frag_size) mtu = np->frag_size; @@ -631,9 +651,8 @@ err = 0; if (flags&MSG_PROBE) goto out; - - skb = sock_alloc_send_skb(sk, pktlength + 15 + - dev->hard_header_len, + /* alloc skb with mtu as we do in the IPv4 stack for IPsec */ + skb = sock_alloc_send_skb(sk, mtu + LL_RESERVED_SPACE(dev), flags & MSG_DONTWAIT, &err); if (skb == NULL) { @@ -664,6 +683,8 @@ err = getfrag(data, &hdr->saddr, ((char *) hdr) + (pktlength - length), 0, length); + if (!opt || !opt->dst1opt) + skb->h.raw = ((char *) hdr) + (pktlength - length); if (!err) { IP6_INC_STATS(Ip6OutRequests); @@ -688,7 +709,9 @@ * cleanup */ out: - ip6_dst_store(sk, dst, fl->nl_u.ip6_u.daddr == &np->daddr ? &np->daddr : NULL); + ip6_dst_store(sk, dst, + !ipv6_addr_cmp(&fl->fl6_dst, &np->daddr) ? + &np->daddr : NULL); if (err > 0) err = np->recverr ? net_xmit_errno(err) : 0; return err; @@ -723,7 +746,7 @@ static inline int ip6_forward_finish(struct sk_buff *skb) { - return skb->dst->output(skb); + return dst_output(skb); } int ip6_forward(struct sk_buff *skb) @@ -735,6 +758,9 @@ if (ipv6_devconf.forwarding == 0) goto error; + if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) + goto drop; + skb->ip_summed = CHECKSUM_NONE; /* @@ -769,6 +795,9 @@ return -ETIMEDOUT; } + if (!xfrm6_route_forward(skb)) + goto drop; + /* IPv6 specs say nothing about it, but it is clear that we cannot send redirects to source routed frames. 
*/ @@ -799,10 +828,10 @@ goto error; } - if (skb->len > dst->pmtu) { + if (skb->len > dst_pmtu(dst)) { /* Again, force OUTPUT device used as source address */ skb->dev = dst->dev; - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst->pmtu, skb->dev); + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_pmtu(dst), skb->dev); IP6_INC_STATS_BH(Ip6InTooBigErrors); kfree_skb(skb); return -EMSGSIZE; @@ -826,3 +855,653 @@ kfree_skb(skb); return -EINVAL; } + +static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) +{ + to->pkt_type = from->pkt_type; + to->priority = from->priority; + to->protocol = from->protocol; + to->security = from->security; + to->dst = dst_clone(from->dst); + to->dev = from->dev; + +#ifdef CONFIG_NET_SCHED + to->tc_index = from->tc_index; +#endif +#ifdef CONFIG_NETFILTER + to->nfmark = from->nfmark; + /* Connection association is same as pre-frag packet */ + to->nfct = from->nfct; + nf_conntrack_get(to->nfct); +#ifdef CONFIG_NETFILTER_DEBUG + to->nf_debug = from->nf_debug; +#endif +#endif +} + +int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) +{ + u16 offset = sizeof(struct ipv6hdr); + struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1); + unsigned int packet_len = skb->tail - skb->nh.raw; + int found_rhdr = 0; + *nexthdr = &skb->nh.ipv6h->nexthdr; + + while (offset + 1 <= packet_len) { + + switch (**nexthdr) { + + case NEXTHDR_HOP: + case NEXTHDR_ROUTING: + case NEXTHDR_DEST: + if (**nexthdr == NEXTHDR_ROUTING) found_rhdr = 1; + if (**nexthdr == NEXTHDR_DEST && found_rhdr) return offset; + offset += ipv6_optlen(exthdr); + *nexthdr = &exthdr->nexthdr; + exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset); + break; + default : + return offset; + } + } + + return offset; +} + +static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) +{ + struct net_device *dev; + struct rt6_info *rt = (struct rt6_info*)skb->dst; + struct sk_buff *frag; + struct ipv6hdr *tmp_hdr; + struct frag_hdr *fh; + unsigned int 
mtu, hlen, left, len; + u32 frag_id = 0; + int ptr, offset = 0, err=0; + u8 *prevhdr, nexthdr = 0; + + dev = rt->u.dst.dev; + hlen = ip6_find_1stfragopt(skb, &prevhdr); + nexthdr = *prevhdr; + + mtu = dst_pmtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr); + + if (skb_shinfo(skb)->frag_list) { + int first_len = skb_pagelen(skb); + + if (first_len - hlen > mtu || + ((first_len - hlen) & 7) || + skb_cloned(skb)) + goto slow_path; + + for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) { + /* Correct geometry. */ + if (frag->len > mtu || + ((frag->len & 7) && frag->next) || + skb_headroom(frag) < hlen) + goto slow_path; + + /* Correct socket ownership. */ + if (frag->sk == NULL) + goto slow_path; + + /* Partially cloned skb? */ + if (skb_shared(frag)) + goto slow_path; + } + + err = 0; + offset = 0; + frag = skb_shinfo(skb)->frag_list; + skb_shinfo(skb)->frag_list = 0; + /* BUILD HEADER */ + + tmp_hdr = kmalloc(hlen, GFP_ATOMIC); + if (!tmp_hdr) { + IP6_INC_STATS(Ip6FragFails); + return -ENOMEM; + } + + *prevhdr = NEXTHDR_FRAGMENT; + memcpy(tmp_hdr, skb->nh.raw, hlen); + __skb_pull(skb, hlen); + fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr)); + skb->nh.raw = __skb_push(skb, hlen); + memcpy(skb->nh.raw, tmp_hdr, hlen); + + ipv6_select_ident(skb, fh); + fh->nexthdr = nexthdr; + fh->reserved = 0; + fh->frag_off = htons(IP6_MF); + frag_id = fh->identification; + + first_len = skb_pagelen(skb); + skb->data_len = first_len - skb_headlen(skb); + skb->len = first_len; + skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr)); + + + for (;;) { + /* Prepare header of the next frame, + * before previous one went down. 
*/ + if (frag) { + frag->h.raw = frag->data; + fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr)); + frag->nh.raw = __skb_push(frag, hlen); + memcpy(frag->nh.raw, tmp_hdr, hlen); + offset += skb->len - hlen - sizeof(struct frag_hdr); + fh->nexthdr = nexthdr; + fh->reserved = 0; + fh->frag_off = htons(offset); + if (frag->next != NULL) + fh->frag_off |= htons(IP6_MF); + fh->identification = frag_id; + frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr)); + ip6_copy_metadata(frag, skb); + } + err = output(skb); + + if (err || !frag) + break; + + skb = frag; + frag = skb->next; + skb->next = NULL; + } + + if (tmp_hdr) + kfree(tmp_hdr); + + if (err == 0) { + IP6_INC_STATS(Ip6FragOKs); + return 0; + } + + while (frag) { + skb = frag->next; + kfree_skb(frag); + frag = skb; + } + + IP6_INC_STATS(Ip6FragFails); + return err; + } + +slow_path: + left = skb->len - hlen; /* Space per frame */ + ptr = hlen; /* Where to start from */ + + /* + * Fragment the datagram. + */ + + *prevhdr = NEXTHDR_FRAGMENT; + + /* + * Keep copying data until we run out. + */ + while(left > 0) { + len = left; + /* IF: it doesn't fit, use 'mtu' - the data space left */ + if (len > mtu) + len = mtu; + /* IF: we are not sending upto and including the packet end + then align the next start on an eight byte boundary */ + if (len < left) { + len &= ~7; + } + /* + * Allocate buffer. 
+ */ + + if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) { + NETDEBUG(printk(KERN_INFO "IPv6: frag: no memory for new fragment!\n")); + err = -ENOMEM; + goto fail; + } + + /* + * Set up data on packet + */ + + ip6_copy_metadata(frag, skb); + skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev)); + skb_put(frag, len + hlen + sizeof(struct frag_hdr)); + frag->nh.raw = frag->data; + fh = (struct frag_hdr*)(frag->data + hlen); + frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr); + + /* + * Charge the memory for the fragment to any owner + * it might possess + */ + if (skb->sk) + skb_set_owner_w(frag, skb->sk); + + /* + * Copy the packet header into the new buffer. + */ + memcpy(frag->nh.raw, skb->data, hlen); + + /* + * Build fragment header: first fragment picks the ident, the rest reuse it. + */ + fh->nexthdr = nexthdr; + fh->reserved = 0; + if (!frag_id) { + ipv6_select_ident(skb, fh); + frag_id = fh->identification; + } else + fh->identification = frag_id; + + /* + * Copy a block of the IP datagram. + */ + if (skb_copy_bits(skb, ptr, frag->h.raw, len)) + BUG(); + left -= len; + + fh->frag_off = htons(offset); + if (left > 0) + fh->frag_off |= htons(IP6_MF); + frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr)); + + ptr += len; + offset += len; + + /* + * Put this fragment into the sending queue. + */ + + IP6_INC_STATS(Ip6FragCreates); + + err = output(frag); + if (err) + goto fail; + } + kfree_skb(skb); + IP6_INC_STATS(Ip6FragOKs); + return err; + +fail: + kfree_skb(skb); + IP6_INC_STATS(Ip6FragFails); + return err; +} + +int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl) +{ + int err = 0; + + if (sk) { + struct ipv6_pinfo *np = inet6_sk(sk); + + *dst = __sk_dst_check(sk, np->dst_cookie); + if (*dst) { + struct rt6_info *rt = (struct rt6_info*)*dst; + + /* Yes, checking route validity in not connected + case is not very simple. 
Take into account, + that we do not support routing by source, TOS, + and MSG_DONTROUTE --ANK (980726) + + 1. If route was host route, check that + cached destination is current. + If it is network route, we still may + check its validity using saved pointer + to the last used address: daddr_cache. + We do not want to save whole address now, + (because main consumer of this service + is tcp, which has not this problem), + so that the last trick works only on connected + sockets. + 2. oif also should be the same. + */ + + if (((rt->rt6i_dst.plen != 128 || + ipv6_addr_cmp(&fl->fl6_dst, &rt->rt6i_dst.addr)) + && (np->daddr_cache == NULL || + ipv6_addr_cmp(&fl->fl6_dst, np->daddr_cache))) + || (fl->oif && fl->oif != (*dst)->dev->ifindex)) { + *dst = NULL; + } else + dst_hold(*dst); + } + } + + if (*dst == NULL) + *dst = ip6_route_output(sk, fl); + + if ((err = (*dst)->error)) + goto out_err_release; + + if (ipv6_addr_any(&fl->fl6_src)) { + err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src); + + if (err) { +#if IP6_DEBUG >= 2 + printk(KERN_DEBUG "ip6_dst_lookup: " + "no availiable source address\n"); +#endif + goto out_err_release; + } + } + if ((err = xfrm_lookup(dst, fl, sk, 0)) < 0) { + err = -ENETUNREACH; + goto out_err_release; + } + + return 0; + +out_err_release: + dst_release(*dst); + *dst = NULL; + return err; +} + +int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), + void *from, int length, int transhdrlen, + int hlimit, struct ipv6_txoptions *opt, struct flowi *fl, struct rt6_info *rt, + unsigned int flags) +{ + struct inet_opt *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct sk_buff *skb; + unsigned int maxfraglen, fragheaderlen; + int exthdrlen; + int hh_len; + int mtu; + int copy = 0; + int err; + int offset = 0; + int csummode = CHECKSUM_NONE; + + if (flags&MSG_PROBE) + return 0; + if (skb_queue_empty(&sk->write_queue)) { + /* + * setup for corking + */ + if 
(opt) { + if (np->cork.opt == NULL) + np->cork.opt = kmalloc(opt->tot_len, sk->allocation); + memcpy(np->cork.opt, opt, opt->tot_len); + inet->cork.flags |= IPCORK_OPT; + /* need source address above miyazawa*/ + } + dst_hold(&rt->u.dst); + np->cork.rt = rt; + np->cork.fl = *fl; + np->cork.hop_limit = hlimit; + inet->cork.fragsize = mtu = dst_pmtu(&rt->u.dst); + inet->cork.length = 0; + inet->sndmsg_page = NULL; + inet->sndmsg_off = 0; + exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0); + length += exthdrlen; + transhdrlen += exthdrlen; + } else { + rt = np->cork.rt; + if (inet->cork.flags & IPCORK_OPT) + opt = np->cork.opt; + transhdrlen = 0; + exthdrlen = 0; + mtu = inet->cork.fragsize; + } + + hh_len = (rt->u.dst.dev->hard_header_len&~15) + 16; + + fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0); + maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr); + + if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) { + if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) { + ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen); + return -EMSGSIZE; + } + } + + inet->cork.length += length; + + if ((skb = skb_peek_tail(&sk->write_queue)) == NULL) + goto alloc_new_skb; + + while (length > 0) { + if ((copy = maxfraglen - skb->len) <= 0) { + char *data; + unsigned int datalen; + unsigned int fraglen; + unsigned int alloclen; + BUG_TRAP(copy == 0); +alloc_new_skb: + datalen = maxfraglen - fragheaderlen; + if (datalen > length) + datalen = length; + fraglen = datalen + fragheaderlen; + if ((flags & MSG_MORE) && + !(rt->u.dst.dev->features&NETIF_F_SG)) + alloclen = maxfraglen; + else + alloclen = fraglen; + alloclen += sizeof(struct frag_hdr); + if (transhdrlen) { + skb = sock_alloc_send_skb(sk, + alloclen + hh_len + 15, + (flags & MSG_DONTWAIT), &err); + } else { + skb = NULL; + if (atomic_read(&sk->wmem_alloc) <= 2*sk->sndbuf) + skb = sock_wmalloc(sk, + alloclen + hh_len + 15, 1, + 
sk->allocation); + if (unlikely(skb == NULL)) + err = -ENOBUFS; + } + if (skb == NULL) + goto error; + /* + * Fill in the control structures + */ + skb->ip_summed = csummode; + skb->csum = 0; + /* reserve 8 byte for fragmentation */ + skb_reserve(skb, hh_len+sizeof(struct frag_hdr)); + + /* + * Find where to start putting bytes + */ + data = skb_put(skb, fraglen); + skb->nh.raw = data + exthdrlen; + data += fragheaderlen; + skb->h.raw = data + exthdrlen; + copy = datalen - transhdrlen; + if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, 0, skb) < 0) { + err = -EFAULT; + kfree_skb(skb); + goto error; + } + + offset += copy; + length -= datalen; + transhdrlen = 0; + exthdrlen = 0; + csummode = CHECKSUM_NONE; + + /* + * Put the packet on the pending queue + */ + __skb_queue_tail(&sk->write_queue, skb); + continue; + } + + if (copy > length) + copy = length; + + if (!(rt->u.dst.dev->features&NETIF_F_SG)) { + unsigned int off; + + off = skb->len; + if (getfrag(from, skb_put(skb, copy), + offset, copy, off, skb) < 0) { + __skb_trim(skb, off); + err = -EFAULT; + goto error; + } + } else { + int i = skb_shinfo(skb)->nr_frags; + skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1]; + struct page *page = inet->sndmsg_page; + int off = inet->sndmsg_off; + unsigned int left; + + if (page && (left = PAGE_SIZE - off) > 0) { + if (copy >= left) + copy = left; + if (page != frag->page) { + if (i == MAX_SKB_FRAGS) { + err = -EMSGSIZE; + goto error; + } + get_page(page); + skb_fill_page_desc(skb, i, page, inet->sndmsg_off, 0); + frag = &skb_shinfo(skb)->frags[i]; + } + } else if(i < MAX_SKB_FRAGS) { + if (copy > PAGE_SIZE) + copy = PAGE_SIZE; + page = alloc_pages(sk->allocation, 0); + if (page == NULL) { + err = -ENOMEM; + goto error; + } + inet->sndmsg_page = page; + inet->sndmsg_off = 0; + + skb_fill_page_desc(skb, i, page, 0, 0); + frag = &skb_shinfo(skb)->frags[i]; + skb->truesize += PAGE_SIZE; + atomic_add(PAGE_SIZE, &sk->wmem_alloc); + } else { + err = -EMSGSIZE; + 
goto error; + } + if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) { + err = -EFAULT; + goto error; + } + inet->sndmsg_off += copy; + frag->size += copy; + skb->len += copy; + skb->data_len += copy; + } + offset += copy; + length -= copy; + } + return 0; +error: + inet->cork.length -= length; + IP6_INC_STATS(Ip6OutDiscards); + return err; +} + +int ip6_push_pending_frames(struct sock *sk) +{ + struct sk_buff *skb, *tmp_skb; + struct sk_buff **tail_skb; + struct in6_addr final_dst_buf, *final_dst = &final_dst_buf; + struct inet_opt *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6hdr *hdr; + struct ipv6_txoptions *opt = np->cork.opt; + struct rt6_info *rt = np->cork.rt; + struct flowi *fl = &np->cork.fl; + unsigned char proto = fl->proto; + int err = 0; + + if ((skb = __skb_dequeue(&sk->write_queue)) == NULL) + goto out; + tail_skb = &(skb_shinfo(skb)->frag_list); + + /* move skb->data to ip header from ext header */ + if (skb->data < skb->nh.raw) + __skb_pull(skb, skb->nh.raw - skb->data); + while ((tmp_skb = __skb_dequeue(&sk->write_queue)) != NULL) { + __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw); + *tail_skb = tmp_skb; + tail_skb = &(tmp_skb->next); + skb->len += tmp_skb->len; + skb->data_len += tmp_skb->len; +#if 0 /* Logically correct, but useless work, ip_fragment() will have to undo */ + skb->truesize += tmp_skb->truesize; + __sock_put(tmp_skb->sk); + tmp_skb->destructor = NULL; + tmp_skb->sk = NULL; +#endif + } + + ipv6_addr_copy(final_dst, &fl->fl6_dst); + __skb_pull(skb, skb->h.raw - skb->nh.raw); + if (opt && opt->opt_flen) + ipv6_push_frag_opts(skb, opt, &proto); + if (opt && opt->opt_nflen) + ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst); + + skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr)); + + *(u32*)hdr = fl->fl6_flowlabel | htonl(0x60000000); + + if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) + hdr->payload_len = 
htons(skb->len - sizeof(struct ipv6hdr)); + else + hdr->payload_len = 0; + hdr->hop_limit = np->cork.hop_limit; + hdr->nexthdr = proto; + ipv6_addr_copy(&hdr->saddr, &fl->fl6_src); + ipv6_addr_copy(&hdr->daddr, final_dst); + + skb->dst = dst_clone(&rt->u.dst); + err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output); + if (err) { + if (err > 0) + err = inet->recverr ? net_xmit_errno(err) : 0; + if (err) + goto error; + } + +out: + inet->cork.flags &= ~IPCORK_OPT; + if (np->cork.opt) { + kfree(np->cork.opt); + np->cork.opt = NULL; + } + if (np->cork.rt) { + np->cork.rt = NULL; + } + memset(&np->cork.fl, 0, sizeof(np->cork.fl)); + return err; +error: + goto out; +} + +void ip6_flush_pending_frames(struct sock *sk) +{ + struct inet_opt *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct sk_buff *skb; + + while ((skb = __skb_dequeue_tail(&sk->write_queue)) != NULL) + kfree_skb(skb); + + inet->cork.flags &= ~IPCORK_OPT; + + if (np->cork.opt) { + kfree(np->cork.opt); + np->cork.opt = NULL; + } + if (np->cork.rt) { + dst_release(&np->cork.rt->u.dst); + dst_release(&np->cork.rt->u.dst); + np->cork.rt = NULL; + } + memset(&np->cork.fl, 0, sizeof(np->cork.fl)); +} Index: net/ipv6/ip6_tunnel.c =================================================================== RCS file: net/ipv6/ip6_tunnel.c diff -N net/ipv6/ip6_tunnel.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ b/net/ipv6/ip6_tunnel.c 16 Apr 2004 13:16:25 -0000 1.6.12.1 @@ -0,0 +1,1260 @@ +/* + * IPv6 over IPv6 tunnel device + * Linux INET6 implementation + * + * Authors: + * Ville Nuorvala + * + * $Id$ + * + * Based on: + * linux/net/ipv6/sit.c + * + * RFC 2473 + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Ville Nuorvala"); +MODULE_DESCRIPTION("IPv6-in-IPv6 tunnel"); +MODULE_LICENSE("GPL"); + +#define IPV6_TLV_TEL_DST_SIZE 8 + +#ifdef IP6_TNL_DEBUG +#define IP6_TNL_TRACE(x...) printk(KERN_DEBUG "%s:" x "\n", __FUNCTION__) +#else +#define IP6_TNL_TRACE(x...) do {;} while(0) +#endif + +#define IPV6_TCLASS_MASK (IPV6_FLOWINFO_MASK & ~IPV6_FLOWLABEL_MASK) + +/* socket(s) used by ip6ip6_tnl_xmit() for resending packets */ +static struct socket *__ip6_socket[NR_CPUS]; +#define ip6_socket __ip6_socket[smp_processor_id()] + +static void ip6_xmit_lock(void) +{ + local_bh_disable(); + if (unlikely(!spin_trylock(&ip6_socket->sk->lock.slock))) + BUG(); +} + +static void ip6_xmit_unlock(void) +{ + spin_unlock_bh(&ip6_socket->sk->lock.slock); +} + +#define HASH_SIZE 32 + +#define HASH(addr) (((addr)->s6_addr32[0] ^ (addr)->s6_addr32[1] ^ \ + (addr)->s6_addr32[2] ^ (addr)->s6_addr32[3]) & \ + (HASH_SIZE - 1)) + +static int ip6ip6_fb_tnl_dev_init(struct net_device *dev); +static int ip6ip6_tnl_dev_init(struct net_device *dev); + +/* the IPv6 tunnel fallback device */ +static struct net_device ip6ip6_fb_tnl_dev = { + .name = "ip6tnl0", + .init = ip6ip6_fb_tnl_dev_init +}; + +/* the IPv6 fallback tunnel */ +static struct ip6_tnl ip6ip6_fb_tnl = { + .dev = &ip6ip6_fb_tnl_dev, + .parms ={.name = "ip6tnl0", .proto = IPPROTO_IPV6} +}; + +/* lists for storing tunnels in use */ +static struct ip6_tnl *tnls_r_l[HASH_SIZE]; +static struct ip6_tnl *tnls_wc[1]; +static struct ip6_tnl **tnls[2] = { tnls_wc, tnls_r_l }; + +/* lock for the tunnel lists */ +static rwlock_t ip6ip6_lock = RW_LOCK_UNLOCKED; + +/** + * ip6ip6_tnl_lookup - fetch tunnel matching the end-point addresses + * 
@remote: the address of the tunnel exit-point + * @local: the address of the tunnel entry-point + * + * Return: + * tunnel matching given end-points if found, + * else fallback tunnel if its device is up, + * else %NULL + **/ + +struct ip6_tnl * +ip6ip6_tnl_lookup(struct in6_addr *remote, struct in6_addr *local) +{ + unsigned h0 = HASH(remote); + unsigned h1 = HASH(local); + struct ip6_tnl *t; + + for (t = tnls_r_l[h0 ^ h1]; t; t = t->next) { + if (!ipv6_addr_cmp(local, &t->parms.laddr) && + !ipv6_addr_cmp(remote, &t->parms.raddr) && + (t->dev->flags & IFF_UP)) + return t; + } + if ((t = tnls_wc[0]) != NULL && (t->dev->flags & IFF_UP)) + return t; + + return NULL; +} + +/** + * ip6ip6_bucket - get head of list matching given tunnel parameters + * @p: parameters containing tunnel end-points + * + * Description: + * ip6ip6_bucket() returns the head of the list matching the + * &struct in6_addr entries laddr and raddr in @p. + * + * Return: head of IPv6 tunnel list + **/ + +static struct ip6_tnl ** +ip6ip6_bucket(struct ip6_tnl_parm *p) +{ + struct in6_addr *remote = &p->raddr; + struct in6_addr *local = &p->laddr; + unsigned h = 0; + int prio = 0; + + if (!ipv6_addr_any(remote) || !ipv6_addr_any(local)) { + prio = 1; + h = HASH(remote) ^ HASH(local); + } + return &tnls[prio][h]; +} + +/** + * ip6ip6_tnl_link - add tunnel to hash table (head published under ip6ip6_lock) + * @t: tunnel to be added + **/ + +static void +ip6ip6_tnl_link(struct ip6_tnl *t) +{ + struct ip6_tnl **tp = ip6ip6_bucket(&t->parms); + + t->next = *tp; + write_lock_bh(&ip6ip6_lock); + *tp = t; + write_unlock_bh(&ip6ip6_lock); +} + +/** + * ip6ip6_tnl_unlink - remove tunnel from hash table + * @t: tunnel to be removed + **/ + +static void +ip6ip6_tnl_unlink(struct ip6_tnl *t) +{ + struct ip6_tnl **tp; + + for (tp = ip6ip6_bucket(&t->parms); *tp; tp = &(*tp)->next) { + if (t == *tp) { + write_lock_bh(&ip6ip6_lock); + *tp = t->next; + write_unlock_bh(&ip6ip6_lock); + break; + } + } +} + +/** + * ip6_tnl_create() - create a new tunnel 
+ * @p: tunnel parameters + * @pt: pointer to new tunnel + * + * Description: + * Create tunnel matching given parameters. + * + * Return: + * 0 on success + **/ + +static int +ip6_tnl_create(struct ip6_tnl_parm *p, struct ip6_tnl **pt) +{ + struct net_device *dev; + int err = -ENOBUFS; + struct ip6_tnl *t; + + dev = kmalloc(sizeof (*dev) + sizeof (*t), GFP_KERNEL); + if (!dev) + return err; + + memset(dev, 0, sizeof (*dev) + sizeof (*t)); + dev->priv = (void *) (dev + 1); + t = (struct ip6_tnl *) dev->priv; + t->dev = dev; + dev->init = ip6ip6_tnl_dev_init; + memcpy(&t->parms, p, sizeof (*p)); + t->parms.name[IFNAMSIZ - 1] = '\0'; + strcpy(dev->name, t->parms.name); + if (!dev->name[0]) { + int i = 0; + int exists = 0; + + do { + sprintf(dev->name, "ip6tnl%d", ++i); + exists = (__dev_get_by_name(dev->name) != NULL); + } while (i < IP6_TNL_MAX && exists); + + if (i == IP6_TNL_MAX) { + goto failed; + } + memcpy(t->parms.name, dev->name, IFNAMSIZ); + } + SET_MODULE_OWNER(dev); + if ((err = register_netdevice(dev)) < 0) { + goto failed; + } + ip6ip6_tnl_link(t); + *pt = t; + return 0; +failed: + kfree(dev); + return err; +} + +/** + * ip6_tnl_destroy() - destroy old tunnel + * @t: tunnel to be destroyed + * + * Return: + * whatever unregister_netdevice() returns + **/ + +static inline int +ip6_tnl_destroy(struct ip6_tnl *t) +{ + return unregister_netdevice(t->dev); +} + +/** + * ip6ip6_tnl_locate - find or create tunnel matching given parameters + * @p: tunnel parameters + * @create: != 0 if allowed to create new tunnel if no match found + * + * Description: + * ip6ip6_tnl_locate() first tries to locate an existing tunnel + * based on @parms. If this is unsuccessful, but @create is set a new + * tunnel device is created and registered for use. 
+ * + * Return: + * 0 if tunnel located or created, + * -EINVAL if parameters incorrect, + * -ENODEV if no matching tunnel available + **/ + +static int +ip6ip6_tnl_locate(struct ip6_tnl_parm *p, struct ip6_tnl **pt, int create) +{ + struct in6_addr *remote = &p->raddr; + struct in6_addr *local = &p->laddr; + struct ip6_tnl *t; + + if (p->proto != IPPROTO_IPV6) + return -EINVAL; + + for (t = *ip6ip6_bucket(p); t; t = t->next) { + if (!ipv6_addr_cmp(local, &t->parms.laddr) && + !ipv6_addr_cmp(remote, &t->parms.raddr)) { + *pt = t; + return (create ? -EEXIST : 0); + } + } + if (!create) { + return -ENODEV; + } + return ip6_tnl_create(p, pt); +} + +/** + * ip6ip6_tnl_dev_destructor - tunnel device destructor + * @dev: the device to be destroyed + **/ + +static void +ip6ip6_tnl_dev_destructor(struct net_device *dev) +{ + kfree(dev); +} + +/** + * ip6ip6_tnl_dev_uninit - tunnel device uninitializer + * @dev: the device to be destroyed + * + * Description: + * ip6ip6_tnl_dev_uninit() removes tunnel from its list + **/ + +static void +ip6ip6_tnl_dev_uninit(struct net_device *dev) +{ + if (dev == &ip6ip6_fb_tnl_dev) { + write_lock_bh(&ip6ip6_lock); + tnls_wc[0] = NULL; + write_unlock_bh(&ip6ip6_lock); + } else { + struct ip6_tnl *t = (struct ip6_tnl *) dev->priv; + ip6ip6_tnl_unlink(t); + } +} + +/** + * parse_tvl_tnl_enc_lim - handle encapsulation limit option + * @skb: received socket buffer + * + * Return: + * 0 if none was found, + * else index to encapsulation limit + **/ + +static __u16 +parse_tlv_tnl_enc_lim(struct sk_buff *skb, __u8 * raw) +{ + struct ipv6hdr *ipv6h = (struct ipv6hdr *) raw; + __u8 nexthdr = ipv6h->nexthdr; + __u16 off = sizeof (*ipv6h); + + while (ipv6_ext_hdr(nexthdr) && nexthdr != NEXTHDR_NONE) { + __u16 optlen = 0; + struct ipv6_opt_hdr *hdr; + if (raw + off + sizeof (*hdr) > skb->data && + !pskb_may_pull(skb, raw - skb->data + off + sizeof (*hdr))) + break; + + hdr = (struct ipv6_opt_hdr *) (raw + off); + if (nexthdr == NEXTHDR_FRAGMENT) { + 
struct frag_hdr *frag_hdr = (struct frag_hdr *) hdr; + if (frag_hdr->frag_off) + break; + optlen = 8; + } else if (nexthdr == NEXTHDR_AUTH) { + optlen = (hdr->hdrlen + 2) << 2; + } else { + optlen = ipv6_optlen(hdr); + } + if (nexthdr == NEXTHDR_DEST) { + __u16 i = off + 2; + while (1) { + struct ipv6_tlv_tnl_enc_lim *tel; + + /* No more room for encapsulation limit */ + if (i + sizeof (*tel) > off + optlen) + break; + + tel = (struct ipv6_tlv_tnl_enc_lim *) &raw[i]; + /* return index of option if found and valid */ + if (tel->type == IPV6_TLV_TNL_ENCAP_LIMIT && + tel->length == 1) + return i; + /* else jump to next option */ + if (tel->type) + i += tel->length + 2; + else + i++; + } + } + nexthdr = hdr->nexthdr; + off += optlen; + } + return 0; +} + +/** + * ip6ip6_err - tunnel error handler + * + * Description: + * ip6ip6_err() should handle errors in the tunnel according + * to the specifications in RFC 2473. + **/ + +void ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + int type, int code, int offset, __u32 info) +{ + struct ipv6hdr *ipv6h = (struct ipv6hdr *) skb->data; + struct ip6_tnl *t; + int rel_msg = 0; + int rel_type = ICMPV6_DEST_UNREACH; + int rel_code = ICMPV6_ADDR_UNREACH; + __u32 rel_info = 0; + __u16 len; + + /* If the packet doesn't contain the original IPv6 header we are + in trouble since we might need the source address for furter + processing of the error. 
*/ + + read_lock(&ip6ip6_lock); + if ((t = ip6ip6_tnl_lookup(&ipv6h->daddr, &ipv6h->saddr)) == NULL) + goto out; + + switch (type) { + __u32 teli; + struct ipv6_tlv_tnl_enc_lim *tel; + __u32 mtu; + case ICMPV6_DEST_UNREACH: + if (net_ratelimit()) + printk(KERN_WARNING + "%s: Path to destination invalid " + "or inactive!\n", t->parms.name); + rel_msg = 1; + break; + case ICMPV6_TIME_EXCEED: + if (code == ICMPV6_EXC_HOPLIMIT) { + if (net_ratelimit()) + printk(KERN_WARNING + "%s: Too small hop limit or " + "routing loop in tunnel!\n", + t->parms.name); + rel_msg = 1; + } + break; + case ICMPV6_PARAMPROB: + /* ignore if parameter problem not caused by a tunnel + encapsulation limit sub-option */ + if (code != ICMPV6_HDR_FIELD) { + break; + } + teli = parse_tlv_tnl_enc_lim(skb, skb->data); + + if (teli && teli == ntohl(info) - 2) { + tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli]; + if (tel->encap_limit == 0) { + if (net_ratelimit()) + printk(KERN_WARNING + "%s: Too small encapsulation " + "limit or routing loop in " + "tunnel!\n", t->parms.name); + rel_msg = 1; + } + } + break; + case ICMPV6_PKT_TOOBIG: + mtu = ntohl(info) - offset; + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + t->dev->mtu = mtu; + + if ((len = sizeof (*ipv6h) + ntohs(ipv6h->payload_len)) > mtu) { + rel_type = ICMPV6_PKT_TOOBIG; + rel_code = 0; + rel_info = mtu; + rel_msg = 1; + } + break; + } + if (rel_msg && pskb_may_pull(skb, offset + sizeof (*ipv6h))) { + struct rt6_info *rt; + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + if (!skb2) + goto out; + + dst_release(skb2->dst); + skb2->dst = NULL; + skb_pull(skb2, offset); + skb2->nh.raw = skb2->data; + + /* Try to guess incoming interface */ + rt = rt6_lookup(&skb2->nh.ipv6h->saddr, NULL, 0, 0); + + if (rt && rt->rt6i_dev) + skb2->dev = rt->rt6i_dev; + + icmpv6_send(skb2, rel_type, rel_code, rel_info, skb2->dev); + + if (rt) + dst_release(&rt->u.dst); + + kfree_skb(skb2); + } +out: + read_unlock(&ip6ip6_lock); +} + +/** + * ip6ip6_rcv - 
decapsulate IPv6 packet and retransmit it locally + * @skb: received socket buffer + * + * Return: 0 + **/ + +int ip6ip6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) +{ + struct sk_buff *skb = *pskb; + struct ipv6hdr *ipv6h; + struct ip6_tnl *t; + + if (!pskb_may_pull(skb, sizeof (*ipv6h))) + goto discard; + + ipv6h = skb->nh.ipv6h; + + read_lock(&ip6ip6_lock); + + if ((t = ip6ip6_tnl_lookup(&ipv6h->saddr, &ipv6h->daddr)) != NULL) { + if (!(t->parms.flags & IP6_TNL_F_CAP_RCV)) { + t->stat.rx_dropped++; + read_unlock(&ip6ip6_lock); + goto discard; + } + secpath_reset(skb); + skb->mac.raw = skb->nh.raw; + skb->nh.raw = skb->data; + skb->protocol = htons(ETH_P_IPV6); + skb->pkt_type = PACKET_HOST; + memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); + skb->dev = t->dev; + dst_release(skb->dst); + skb->dst = NULL; + t->stat.rx_packets++; + t->stat.rx_bytes += skb->len; + netif_rx(skb); + read_unlock(&ip6ip6_lock); + return 0; + } + read_unlock(&ip6ip6_lock); + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev); +discard: + kfree_skb(skb); + return 0; +} + +/** + * txopt_len - get necessary size for new &struct ipv6_txoptions + * @orig_opt: old options + * + * Return: + * Size of old one plus size of tunnel encapsulation limit option + **/ + +static inline int +txopt_len(struct ipv6_txoptions *orig_opt) +{ + int len = sizeof (*orig_opt) + 8; + + if (orig_opt && orig_opt->dst0opt) + len += ipv6_optlen(orig_opt->dst0opt); + return len; +} + +/** + * merge_options - add encapsulation limit to original options + * @encap_limit: number of allowed encapsulation limits + * @orig_opt: original options + * + * Return: + * Pointer to new &struct ipv6_txoptions containing the tunnel + * encapsulation limit + **/ + +static struct ipv6_txoptions * +merge_options(struct sock *sk, __u8 encap_limit, + struct ipv6_txoptions *orig_opt) +{ + struct ipv6_tlv_tnl_enc_lim *tel; + struct ipv6_txoptions *opt; + __u8 *raw; + __u8 pad_to = 8; + int opt_len = 
txopt_len(orig_opt); + + if (!(opt = sock_kmalloc(sk, opt_len, GFP_ATOMIC))) { + return NULL; + } + + memset(opt, 0, opt_len); + opt->tot_len = opt_len; + opt->dst0opt = (struct ipv6_opt_hdr *) (opt + 1); + opt->opt_nflen = 8; + + raw = (__u8 *) opt->dst0opt; + + tel = (struct ipv6_tlv_tnl_enc_lim *) (opt->dst0opt + 1); + tel->type = IPV6_TLV_TNL_ENCAP_LIMIT; + tel->length = 1; + tel->encap_limit = encap_limit; + + if (orig_opt) { + __u8 *orig_raw; + + opt->hopopt = orig_opt->hopopt; + + /* Keep the original destination options properly + aligned and merge possible old paddings to the + new padding option */ + if ((orig_raw = (__u8 *) orig_opt->dst0opt) != NULL) { + __u8 type; + int i = sizeof (struct ipv6_opt_hdr); + pad_to += sizeof (struct ipv6_opt_hdr); + while (i < ipv6_optlen(orig_opt->dst0opt)) { + type = orig_raw[i++]; + if (type == IPV6_TLV_PAD0) + pad_to++; + else if (type == IPV6_TLV_PADN) { + int len = orig_raw[i++]; + i += len; + pad_to += len + 2; + } else { + break; + } + } + opt->dst0opt->hdrlen = orig_opt->dst0opt->hdrlen + 1; + memcpy(raw + pad_to, orig_raw + pad_to - 8, + opt_len - sizeof (*opt) - pad_to); + } + opt->srcrt = orig_opt->srcrt; + opt->opt_nflen += orig_opt->opt_nflen; + + opt->dst1opt = orig_opt->dst1opt; + opt->auth = orig_opt->auth; + opt->opt_flen = orig_opt->opt_flen; + } + raw[5] = IPV6_TLV_PADN; + + /* subtract lengths of destination suboption header, + tunnel encapsulation limit and pad N header */ + raw[6] = pad_to - 7; + + return opt; +} + +static int +ip6ip6_getfrag(void *from, char *to, int offset, int len, int odd, + struct sk_buff *skb) +{ + memcpy(to, (char *) from + offset, len); + return 0; +} + +/** + * ip6ip6_tnl_addr_conflict - compare packet addresses to tunnel's own + * @t: the outgoing tunnel device + * @hdr: IPv6 header from the incoming packet + * + * Description: + * Avoid trivial tunneling loop by checking that tunnel exit-point + * doesn't match source of incoming packet. 
+ * + * Return: + * 1 if conflict, + * 0 else + **/ + +static inline int +ip6ip6_tnl_addr_conflict(struct ip6_tnl *t, struct ipv6hdr *hdr) +{ + return !ipv6_addr_cmp(&t->parms.raddr, &hdr->saddr); +} + +/** + * ip6ip6_tnl_xmit - encapsulate packet and send + * @skb: the outgoing socket buffer + * @dev: the outgoing tunnel device + * + * Description: + * Build new header and do some sanity checks on the packet before sending + * it to ip6_build_xmit(). + * + * Return: + * 0 + **/ + +int ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ip6_tnl *t = (struct ip6_tnl *) dev->priv; + struct net_device_stats *stats = &t->stat; + struct ipv6hdr *ipv6h = skb->nh.ipv6h; + struct ipv6_txoptions *orig_opt = NULL; + struct ipv6_txoptions *opt = NULL; + int encap_limit = -1; + __u16 offset; + struct flowi fl; + struct ip6_flowlabel *fl_lbl = NULL; + int err = 0; + struct dst_entry *dst; + int link_failure = 0; + struct sock *sk = ip6_socket->sk; + struct ipv6_pinfo *np = inet6_sk(sk); + int mtu; + + if (t->recursion++) { + stats->collisions++; + goto tx_err; + } + if (skb->protocol != htons(ETH_P_IPV6) || + !(t->parms.flags & IP6_TNL_F_CAP_XMIT) || + ip6ip6_tnl_addr_conflict(t, ipv6h)) { + goto tx_err; + } + if ((offset = parse_tlv_tnl_enc_lim(skb, skb->nh.raw)) > 0) { + struct ipv6_tlv_tnl_enc_lim *tel; + tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->nh.raw[offset]; + if (tel->encap_limit == 0) { + icmpv6_send(skb, ICMPV6_PARAMPROB, + ICMPV6_HDR_FIELD, offset + 2, skb->dev); + goto tx_err; + } + encap_limit = tel->encap_limit - 1; + } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) { + encap_limit = t->parms.encap_limit; + } + ip6_xmit_lock(); + + memcpy(&fl, &t->fl, sizeof (fl)); + + if ((t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)) + fl.fl6_flowlabel |= (*(__u32 *) ipv6h & IPV6_TCLASS_MASK); + if ((t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL)) + fl.fl6_flowlabel |= (*(__u32 *) ipv6h & IPV6_FLOWLABEL_MASK); + + if (fl.fl6_flowlabel) { + fl_lbl = 
fl6_sock_lookup(sk, fl.fl6_flowlabel); + if (fl_lbl) + orig_opt = fl_lbl->opt; + } + if (encap_limit >= 0) { + if (!(opt = merge_options(sk, encap_limit, orig_opt))) { + goto tx_err_free_fl_lbl; + } + } else { + opt = orig_opt; + } + dst = __sk_dst_check(sk, np->dst_cookie); + + if (dst) { + if (np->daddr_cache == NULL || + ipv6_addr_cmp(&fl.fl6_dst, np->daddr_cache) || + (fl.oif && fl.oif != dst->dev->ifindex)) { + dst = NULL; + } + } + if (dst == NULL) { + dst = ip6_route_output(sk, &fl); + if (dst->error) { + stats->tx_carrier_errors++; + link_failure = 1; + goto tx_err_dst_release; + } + /* local routing loop */ + if (dst->dev == dev) { + stats->collisions++; + if (net_ratelimit()) + printk(KERN_WARNING + "%s: Local routing loop detected!\n", + t->parms.name); + goto tx_err_dst_release; + } + ipv6_addr_copy(&np->daddr, &fl.fl6_dst); + ipv6_addr_copy(&np->saddr, &fl.fl6_src); + } + mtu = dst_pmtu(dst) - sizeof (*ipv6h); + if (opt) { + mtu -= (opt->opt_nflen + opt->opt_flen); + } + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + if (skb->dst && mtu < dst_pmtu(skb->dst)) { + struct rt6_info *rt = (struct rt6_info *) skb->dst; + rt->rt6i_flags |= RTF_MODIFIED; + rt->u.dst.metrics[RTAX_MTU-1] = mtu; + } + if (skb->len > mtu) { + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev); + goto tx_err_dst_release; + } + err = ip6_append_data(sk, ip6ip6_getfrag, skb->nh.raw, skb->len, 0, + t->parms.hop_limit, opt, &fl, + (struct rt6_info *)dst, MSG_DONTWAIT); + + if (err) { + ip6_flush_pending_frames(sk); + } else { + err = ip6_push_pending_frames(sk); + err = (err < 0 ? 
err : 0); + } + if (!err) { + stats->tx_bytes += skb->len; + stats->tx_packets++; + } else { + stats->tx_errors++; + stats->tx_aborted_errors++; + } + if (opt && opt != orig_opt) + sock_kfree_s(sk, opt, opt->tot_len); + + fl6_sock_release(fl_lbl); + ip6_dst_store(sk, dst, &np->daddr); + ip6_xmit_unlock(); + kfree_skb(skb); + t->recursion--; + return 0; +tx_err_dst_release: + dst_release(dst); + if (opt && opt != orig_opt) + sock_kfree_s(sk, opt, opt->tot_len); +tx_err_free_fl_lbl: + fl6_sock_release(fl_lbl); + ip6_xmit_unlock(); + if (link_failure) + dst_link_failure(skb); +tx_err: + stats->tx_errors++; + stats->tx_dropped++; + kfree_skb(skb); + t->recursion--; + return 0; +} + +static void ip6_tnl_set_cap(struct ip6_tnl *t) +{ + struct ip6_tnl_parm *p = &t->parms; + struct in6_addr *laddr = &p->laddr; + struct in6_addr *raddr = &p->raddr; + int ltype = ipv6_addr_type(laddr); + int rtype = ipv6_addr_type(raddr); + + p->flags &= ~(IP6_TNL_F_CAP_XMIT|IP6_TNL_F_CAP_RCV); + + if (ltype != IPV6_ADDR_ANY && rtype != IPV6_ADDR_ANY && + ((ltype|rtype) & + (IPV6_ADDR_UNICAST| + IPV6_ADDR_LOOPBACK|IPV6_ADDR_LINKLOCAL| + IPV6_ADDR_MAPPED|IPV6_ADDR_RESERVED)) == IPV6_ADDR_UNICAST) { + struct net_device *ldev = NULL; + int l_ok = 1; + int r_ok = 1; + + if (p->link) + ldev = dev_get_by_index(p->link); + + if ((ltype&IPV6_ADDR_UNICAST) && !ipv6_chk_addr(laddr, ldev)) + l_ok = 0; + + if ((rtype&IPV6_ADDR_UNICAST) && ipv6_chk_addr(raddr, NULL)) + r_ok = 0; + + if (l_ok && r_ok) { + if (ltype&IPV6_ADDR_UNICAST) + p->flags |= IP6_TNL_F_CAP_XMIT; + if (rtype&IPV6_ADDR_UNICAST) + p->flags |= IP6_TNL_F_CAP_RCV; + } + if (ldev) + dev_put(ldev); + } +} + + +static void ip6ip6_tnl_link_config(struct ip6_tnl *t) +{ + struct net_device *dev = t->dev; + struct ip6_tnl_parm *p = &t->parms; + struct flowi *fl; + /* Set up flowi template */ + fl = &t->fl; + ipv6_addr_copy(&fl->fl6_src, &p->laddr); + ipv6_addr_copy(&fl->fl6_dst, &p->raddr); + fl->oif = p->link; + fl->fl6_flowlabel = 0; + + if 
(!(p->flags&IP6_TNL_F_USE_ORIG_TCLASS)) + fl->fl6_flowlabel |= IPV6_TCLASS_MASK & htonl(p->flowinfo); + if (!(p->flags&IP6_TNL_F_USE_ORIG_FLOWLABEL)) + fl->fl6_flowlabel |= IPV6_FLOWLABEL_MASK & htonl(p->flowinfo); + + ip6_tnl_set_cap(t); + + if (p->flags&IP6_TNL_F_CAP_XMIT && p->flags&IP6_TNL_F_CAP_RCV) + dev->flags |= IFF_POINTOPOINT; + else + dev->flags &= ~IFF_POINTOPOINT; + + if (p->flags & IP6_TNL_F_CAP_XMIT) { + struct rt6_info *rt = rt6_lookup(&p->raddr, &p->laddr, + p->link, 0); + if (rt) { + struct net_device *rtdev; + if (!(rtdev = rt->rt6i_dev) || + rtdev->type == ARPHRD_TUNNEL6) { + /* as long as tunnels use the same socket + for transmission, locally nested tunnels + won't work */ + dst_release(&rt->u.dst); + goto no_link; + } else { + dev->iflink = rtdev->ifindex; + dev->hard_header_len = rtdev->hard_header_len + + sizeof (struct ipv6hdr); + dev->mtu = rtdev->mtu - sizeof (struct ipv6hdr); + if (dev->mtu < IPV6_MIN_MTU) + dev->mtu = IPV6_MIN_MTU; + + dst_release(&rt->u.dst); + } + } + } else { + no_link: + dev->iflink = 0; + dev->hard_header_len = LL_MAX_HEADER + sizeof (struct ipv6hdr); + dev->mtu = ETH_DATA_LEN - sizeof (struct ipv6hdr); + } +} + +/** + * ip6ip6_tnl_change - update the tunnel parameters + * @t: tunnel to be changed + * @p: tunnel configuration parameters + * @active: != 0 if tunnel is ready for use + * + * Description: + * ip6ip6_tnl_change() updates the tunnel parameters + **/ + +static int +ip6ip6_tnl_change(struct ip6_tnl *t, struct ip6_tnl_parm *p) +{ + ipv6_addr_copy(&t->parms.laddr, &p->laddr); + ipv6_addr_copy(&t->parms.raddr, &p->raddr); + t->parms.flags = p->flags; + t->parms.hop_limit = p->hop_limit; + t->parms.encap_limit = p->encap_limit; + t->parms.flowinfo = p->flowinfo; + ip6ip6_tnl_link_config(t); + return 0; +} + +/** + * ip6ip6_tnl_ioctl - configure ipv6 tunnels from userspace + * @dev: virtual device associated with tunnel + * @ifr: parameters passed from userspace + * @cmd: command to be performed + * + * 
Description: + * ip6ip6_tnl_ioctl() is used for managing IPv6 tunnels + * from userspace. + * + * The possible commands are the following: + * %SIOCGETTUNNEL: get tunnel parameters for device + * %SIOCADDTUNNEL: add tunnel matching given tunnel parameters + * %SIOCCHGTUNNEL: change tunnel parameters to those given + * %SIOCDELTUNNEL: delete tunnel + * + * The fallback device "ip6tnl0", created during module + * initialization, can be used for creating other tunnel devices. + * + * Return: + * 0 on success, + * %-EFAULT if unable to copy data to or from userspace, + * %-EPERM if current process hasn't %CAP_NET_ADMIN set + * %-EINVAL if passed tunnel parameters are invalid, + * %-EEXIST if changing a tunnel's parameters would cause a conflict + * %-ENODEV if attempting to change or delete a nonexisting device + **/ + +static int +ip6ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +{ + int err = 0; + int create; + struct ip6_tnl_parm p; + struct ip6_tnl *t = NULL; + + switch (cmd) { + case SIOCGETTUNNEL: + if (dev == &ip6ip6_fb_tnl_dev) { + if (copy_from_user(&p, + ifr->ifr_ifru.ifru_data, + sizeof (p))) { + err = -EFAULT; + break; + } + if ((err = ip6ip6_tnl_locate(&p, &t, 0)) == -ENODEV) + t = (struct ip6_tnl *) dev->priv; + else if (err) + break; + } else + t = (struct ip6_tnl *) dev->priv; + + memcpy(&p, &t->parms, sizeof (p)); + if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof (p))) { + err = -EFAULT; + } + break; + case SIOCADDTUNNEL: + case SIOCCHGTUNNEL: + err = -EPERM; + create = (cmd == SIOCADDTUNNEL); + if (!capable(CAP_NET_ADMIN)) + break; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof (p))) { + err = -EFAULT; + break; + } + if (!create && dev != &ip6ip6_fb_tnl_dev) { + t = (struct ip6_tnl *) dev->priv; + } + if (!t && (err = ip6ip6_tnl_locate(&p, &t, create))) { + break; + } + if (cmd == SIOCCHGTUNNEL) { + if (t->dev != dev) { + err = -EEXIST; + break; + } + ip6ip6_tnl_unlink(t); + err = ip6ip6_tnl_change(t, &p); + 
ip6ip6_tnl_link(t); + netdev_state_change(dev); + } + if (copy_to_user(ifr->ifr_ifru.ifru_data, + &t->parms, sizeof (p))) { + err = -EFAULT; + } else { + err = 0; + } + break; + case SIOCDELTUNNEL: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + break; + + if (dev == &ip6ip6_fb_tnl_dev) { + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, + sizeof (p))) { + err = -EFAULT; + break; + } + err = ip6ip6_tnl_locate(&p, &t, 0); + if (err) + break; + if (t == &ip6ip6_fb_tnl) { + err = -EPERM; + break; + } + } else { + t = (struct ip6_tnl *) dev->priv; + } + err = ip6_tnl_destroy(t); + break; + default: + err = -EINVAL; + } + return err; +} + +/** + * ip6ip6_tnl_get_stats - return the stats for tunnel device + * @dev: virtual device associated with tunnel + * + * Return: stats for device + **/ + +static struct net_device_stats * +ip6ip6_tnl_get_stats(struct net_device *dev) +{ + return &(((struct ip6_tnl *) dev->priv)->stat); +} + +/** + * ip6ip6_tnl_change_mtu - change mtu manually for tunnel device + * @dev: virtual device associated with tunnel + * @new_mtu: the new mtu + * + * Return: + * 0 on success, + * %-EINVAL if mtu too small + **/ + +static int +ip6ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) +{ + if (new_mtu < IPV6_MIN_MTU) { + return -EINVAL; + } + dev->mtu = new_mtu; + return 0; +} + +/** + * ip6ip6_tnl_dev_init_gen - general initializer for all tunnel devices + * @dev: virtual device associated with tunnel + * + * Description: + * Set function pointers and initialize the &struct flowi template used + * by the tunnel. 
+ **/ + +static void +ip6ip6_tnl_dev_init_gen(struct net_device *dev) +{ + struct ip6_tnl *t = (struct ip6_tnl *) dev->priv; + struct flowi *fl = &t->fl; + + memset(fl, 0, sizeof (*fl)); + fl->proto = IPPROTO_IPV6; + + dev->destructor = ip6ip6_tnl_dev_destructor; + dev->uninit = ip6ip6_tnl_dev_uninit; + dev->hard_start_xmit = ip6ip6_tnl_xmit; + dev->get_stats = ip6ip6_tnl_get_stats; + dev->do_ioctl = ip6ip6_tnl_ioctl; + dev->change_mtu = ip6ip6_tnl_change_mtu; + dev->type = ARPHRD_TUNNEL6; + dev->flags |= IFF_NOARP; + if (ipv6_addr_type(&t->parms.raddr) & IPV6_ADDR_UNICAST && + ipv6_addr_type(&t->parms.laddr) & IPV6_ADDR_UNICAST) + dev->flags |= IFF_POINTOPOINT; + /* Hmm... MAX_ADDR_LEN is 8, so the ipv6 addresses can't be + copied to dev->dev_addr and dev->broadcast, like the ipv4 + addresses were in ipip.c, ip_gre.c and sit.c. */ + dev->addr_len = 0; +} + +/** + * ip6ip6_tnl_dev_init - initializer for all non fallback tunnel devices + * @dev: virtual device associated with tunnel + **/ + +static int +ip6ip6_tnl_dev_init(struct net_device *dev) +{ + struct ip6_tnl *t = (struct ip6_tnl *) dev->priv; + ip6ip6_tnl_dev_init_gen(dev); + ip6ip6_tnl_link_config(t); + return 0; +} + +/** + * ip6ip6_fb_tnl_dev_init - initializer for fallback tunnel device + * @dev: fallback device + * + * Return: 0 + **/ + +int ip6ip6_fb_tnl_dev_init(struct net_device *dev) +{ + ip6ip6_tnl_dev_init_gen(dev); + tnls_wc[0] = &ip6ip6_fb_tnl; + return 0; +} + +static struct inet6_protocol ip6ip6_protocol = { + .handler = ip6ip6_rcv, + .err_handler = ip6ip6_err, + .flags = INET6_PROTO_FINAL +}; + +/** + * ip6_tunnel_init - register protocol and reserve needed resources + * + * Return: 0 on success + **/ + +int __init ip6_tunnel_init(void) +{ + int i, j, err; + struct sock *sk; + struct ipv6_pinfo *np; + + ip6ip6_fb_tnl_dev.priv = (void *) &ip6ip6_fb_tnl; + + for (i = 0; i < NR_CPUS; i++) { + err = sock_create(PF_INET6, SOCK_RAW, IPPROTO_IPV6, + &__ip6_socket[i]); + if (err < 0) { + 
printk(KERN_ERR + "Failed to create the IPv6 tunnel socket " + "(err %d).\n", + err); + goto fail; + } + sk = __ip6_socket[i]->sk; + sk->allocation = GFP_ATOMIC; + + np = inet6_sk(sk); + np->hop_limit = 255; + np->mc_loop = 0; + + sk->prot->unhash(sk); + } + if ((err = inet6_add_protocol(&ip6ip6_protocol, IPPROTO_IPV6)) < 0) { + printk(KERN_ERR "Failed to register IPv6 protocol\n"); + goto fail; + } + + SET_MODULE_OWNER(&ip6ip6_fb_tnl_dev); + register_netdev(&ip6ip6_fb_tnl_dev); + + return 0; +fail: + for (j = 0; j < i; j++) { + sock_release(__ip6_socket[j]); + __ip6_socket[j] = NULL; + } + return err; +} + +/** + * ip6_tunnel_cleanup - free resources and unregister protocol + **/ + +void ip6_tunnel_cleanup(void) +{ + int i; + + unregister_netdev(&ip6ip6_fb_tnl_dev); + + inet6_del_protocol(&ip6ip6_protocol, IPPROTO_IPV6); + + for (i = 0; i < NR_CPUS; i++) { + sock_release(__ip6_socket[i]); + __ip6_socket[i] = NULL; + } +} + +#ifdef MODULE +module_init(ip6_tunnel_init); +module_exit(ip6_tunnel_cleanup); +#endif Index: net/ipv6/ipcomp6.c =================================================================== RCS file: net/ipv6/ipcomp6.c diff -N net/ipv6/ipcomp6.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ b/net/ipv6/ipcomp6.c 16 Apr 2004 13:16:25 -0000 1.6.12.1 @@ -0,0 +1,378 @@ +/* + * IP Payload Compression Protocol (IPComp) for IPv6 - RFC3173 + * + * Copyright (C)2003 USAGI/WIDE Project + * + * Author Mitsuru KANDA + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +/* + * [Memo] + * + * Outbound: + * The compression of IP datagram MUST be done before AH/ESP processing, + * fragmentation, and the addition of Hop-by-Hop/Routing header. + * + * Inbound: + * The decompression of IP datagram MUST be done after the reassembly, + * AH/ESP processing. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* XXX no ipv6 ipcomp specific */ +#define NIP6(addr) \ + ntohs((addr).s6_addr16[0]),\ + ntohs((addr).s6_addr16[1]),\ + ntohs((addr).s6_addr16[2]),\ + ntohs((addr).s6_addr16[3]),\ + ntohs((addr).s6_addr16[4]),\ + ntohs((addr).s6_addr16[5]),\ + ntohs((addr).s6_addr16[6]),\ + ntohs((addr).s6_addr16[7]) + +static int ipcomp6_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb) +{ + int err = -ENOMEM; /* default for the allocation failures below; overwritten by the decompression result */ + u8 nexthdr = 0; + u8 *prevhdr; + int hdr_len = skb->h.raw - skb->nh.raw; + unsigned char *tmp_hdr = NULL; + struct ipv6hdr *iph; + int plen, dlen; + struct ipcomp_data *ipcd = x->data; + u8 *start, *scratch = ipcd->scratch; + + if ((skb_is_nonlinear(skb) || skb_cloned(skb)) && + skb_linearize(skb, GFP_ATOMIC) != 0) { + err = -ENOMEM; + goto out; + } + + skb->ip_summed = CHECKSUM_NONE; + + /* Remove ipcomp header and decompress original payload */ + iph = skb->nh.ipv6h; + tmp_hdr = kmalloc(hdr_len, GFP_ATOMIC); + if (!tmp_hdr) + goto out; /* err still holds -ENOMEM, so the failure is reported instead of nexthdr 0 */ + memcpy(tmp_hdr, iph, hdr_len); + nexthdr = *(u8 *)skb->data; + skb_pull(skb, sizeof(struct ipv6_comp_hdr)); + skb->nh.raw += sizeof(struct ipv6_comp_hdr); + memcpy(skb->nh.raw, tmp_hdr, hdr_len); + iph = skb->nh.ipv6h; + iph->payload_len = htons(ntohs(iph->payload_len) - sizeof(struct ipv6_comp_hdr)); + skb->h.raw = skb->data; + + /* decompression */ + plen = skb->len; + 
dlen = IPCOMP_SCRATCH_SIZE; + start = skb->data; + + err = crypto_comp_decompress(ipcd->tfm, start, plen, scratch, &dlen); + if (err) { + err = -EINVAL; + goto out; + } + + if (dlen < (plen + sizeof(struct ipv6_comp_hdr))) { + err = -EINVAL; + goto out; + } + + err = pskb_expand_head(skb, 0, dlen - plen, GFP_ATOMIC); + if (err) { + goto out; + } + + skb_put(skb, dlen - plen); + memcpy(skb->data, scratch, dlen); + + iph = skb->nh.ipv6h; + iph->payload_len = htons(skb->len); + + ip6_find_1stfragopt(skb, &prevhdr); + *prevhdr = nexthdr; +out: + if (tmp_hdr) + kfree(tmp_hdr); + if (err) + goto error_out; + return nexthdr; +error_out: + return err; +} + +static int ipcomp6_output(struct sk_buff *skb) +{ + int err; + struct dst_entry *dst = skb->dst; + struct xfrm_state *x = dst->xfrm; + struct ipv6hdr *tmp_iph = NULL, *iph, *top_iph; + int hdr_len = 0; + struct ipv6_comp_hdr *ipch; + struct ipcomp_data *ipcd = x->data; + u8 *prevhdr; + u8 nexthdr = 0; + int plen, dlen; + u8 *start, *scratch = ipcd->scratch; + + if (skb->ip_summed == CHECKSUM_HW && skb_checksum_help(skb) == NULL) { + err = -EINVAL; + goto error_nolock; + } + + spin_lock_bh(&x->lock); + + err = xfrm_check_output(x, skb, AF_INET6); + if (err) + goto error; + + if (x->props.mode) { + hdr_len = sizeof(struct ipv6hdr); + nexthdr = IPPROTO_IPV6; + iph = skb->nh.ipv6h; + top_iph = (struct ipv6hdr *)skb_push(skb, sizeof(struct ipv6hdr)); + top_iph->version = 6; + top_iph->priority = iph->priority; + top_iph->flow_lbl[0] = iph->flow_lbl[0]; + top_iph->flow_lbl[1] = iph->flow_lbl[1]; + top_iph->flow_lbl[2] = iph->flow_lbl[2]; + top_iph->nexthdr = IPPROTO_IPV6; /* initial */ + top_iph->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + top_iph->hop_limit = iph->hop_limit; + memcpy(&top_iph->saddr, (struct in6_addr *)&x->props.saddr, sizeof(struct in6_addr)); + memcpy(&top_iph->daddr, (struct in6_addr *)&x->id.daddr, sizeof(struct in6_addr)); + skb->nh.raw = skb->data; /* == top_iph */ + skb->h.raw = 
skb->nh.raw + hdr_len; + } else { + hdr_len = ip6_find_1stfragopt(skb, &prevhdr); + nexthdr = *prevhdr; + } + + /* check whether datagram len is larger than threshold */ + if ((skb->len - hdr_len) < ipcd->threshold) { + goto out_ok; + } + + if ((skb_is_nonlinear(skb) || skb_cloned(skb)) && + skb_linearize(skb, GFP_ATOMIC) != 0) { + err = -ENOMEM; + goto error; + } + + /* compression */ + plen = skb->len - hdr_len; + dlen = IPCOMP_SCRATCH_SIZE; + start = skb->data + hdr_len; + + err = crypto_comp_compress(ipcd->tfm, start, plen, scratch, &dlen); + if (err) { + goto error; + } + if ((dlen + sizeof(struct ipv6_comp_hdr)) >= plen) { + goto out_ok; + } + memcpy(start, scratch, dlen); + pskb_trim(skb, hdr_len+dlen); + + /* insert ipcomp header and replace datagram */ + tmp_iph = kmalloc(hdr_len, GFP_ATOMIC); + if (!tmp_iph) { + err = -ENOMEM; + goto error; + } + memcpy(tmp_iph, skb->nh.raw, hdr_len); + top_iph = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6_comp_hdr)); + memcpy(top_iph, tmp_iph, hdr_len); + kfree(tmp_iph); + + if (x->props.mode && (x->props.flags & XFRM_STATE_NOECN)) + IP6_ECN_clear(top_iph); + top_iph->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + skb->nh.raw = skb->data; /* top_iph */ + ip6_find_1stfragopt(skb, &prevhdr); + *prevhdr = IPPROTO_COMP; + + ipch = (struct ipv6_comp_hdr *)((unsigned char *)top_iph + hdr_len); + ipch->nexthdr = nexthdr; + ipch->flags = 0; + ipch->cpi = htons((u16 )ntohl(x->id.spi)); + + skb->h.raw = (unsigned char*)ipch; +out_ok: + x->curlft.bytes += skb->len; + x->curlft.packets++; + spin_unlock_bh(&x->lock); + + if ((skb->dst = dst_pop(dst)) == NULL) { + err = -EHOSTUNREACH; + goto error_nolock; + } + err = NET_XMIT_BYPASS; + +out_exit: + return err; +error: + spin_unlock_bh(&x->lock); +error_nolock: + kfree_skb(skb); + goto out_exit; +} + +static void ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + int type, int code, int offset, __u32 info) +{ + u32 spi; + struct ipv6hdr *iph = (struct 
ipv6hdr*)skb->data; + struct ipv6_comp_hdr *ipcomph = (struct ipv6_comp_hdr*)(skb->data+offset); + struct xfrm_state *x; + + if (type != ICMPV6_DEST_UNREACH && type != ICMPV6_PKT_TOOBIG) + return; /* only Destination Unreachable and Packet Too Big are handled; '||' here made the test always true */ + + spi = ntohl(ntohs(ipcomph->cpi)); + x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, spi, IPPROTO_COMP, AF_INET6); + if (!x) + return; + + printk(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/" + "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", + spi, NIP6(iph->daddr)); + xfrm_state_put(x); +} + +static void ipcomp6_free_data(struct ipcomp_data *ipcd) +{ + if (ipcd->tfm) + crypto_free_tfm(ipcd->tfm); + if (ipcd->scratch) + kfree(ipcd->scratch); +} + +static void ipcomp6_destroy(struct xfrm_state *x) +{ + struct ipcomp_data *ipcd = x->data; + if (!ipcd) + return; + ipcomp6_free_data(ipcd); + kfree(ipcd); +} + +static int ipcomp6_init_state(struct xfrm_state *x, void *args) +{ + int err; + struct ipcomp_data *ipcd; + struct xfrm_algo_desc *calg_desc; + + err = -EINVAL; + if (!x->calg) + goto out; + + err = -ENOMEM; + ipcd = kmalloc(sizeof(*ipcd), GFP_KERNEL); + if (!ipcd) + goto error; + + memset(ipcd, 0, sizeof(*ipcd)); + x->props.header_len = sizeof(struct ipv6_comp_hdr); + if (x->props.mode) + x->props.header_len += sizeof(struct ipv6hdr); + + ipcd->scratch = kmalloc(IPCOMP_SCRATCH_SIZE, GFP_KERNEL); + if (!ipcd->scratch) + goto error; + + ipcd->tfm = crypto_alloc_tfm(x->calg->alg_name, 0); + if (!ipcd->tfm) + goto error; + + calg_desc = xfrm_calg_get_byname(x->calg->alg_name); + BUG_ON(!calg_desc); + ipcd->threshold = calg_desc->uinfo.comp.threshold; + x->data = ipcd; + err = 0; +out: + return err; +error: + if (ipcd) { + ipcomp6_free_data(ipcd); + kfree(ipcd); + } + + goto out; +} + +static struct xfrm_type ipcomp6_type = +{ + .description = "IPCOMP6", + .owner = THIS_MODULE, + .proto = IPPROTO_COMP, + .init_state = ipcomp6_init_state, + .destructor = ipcomp6_destroy, + .input = ipcomp6_input, + .output = ipcomp6_output, +}; + +static struct inet6_protocol 
ipcomp6_protocol = +{ + .handler = xfrm6_rcv, + .err_handler = ipcomp6_err, + .flags = INET6_PROTO_NOPOLICY, +}; + +static int __init ipcomp6_init(void) +{ + if (xfrm_register_type(&ipcomp6_type, AF_INET6) < 0) { + printk(KERN_INFO "ipcomp6 init: can't add xfrm type\n"); + return -EAGAIN; + } + if (inet6_add_protocol(&ipcomp6_protocol, IPPROTO_COMP) < 0) { + printk(KERN_INFO "ipcomp6 init: can't add protocol\n"); + xfrm_unregister_type(&ipcomp6_type, AF_INET6); + return -EAGAIN; + } + return 0; +} + +static void __exit ipcomp6_fini(void) +{ + if (inet6_del_protocol(&ipcomp6_protocol, IPPROTO_COMP) < 0) + printk(KERN_INFO "ipv6 ipcomp close: can't remove protocol\n"); + if (xfrm_unregister_type(&ipcomp6_type, AF_INET6) < 0) + printk(KERN_INFO "ipv6 ipcomp close: can't remove xfrm type\n"); +} + +module_init(ipcomp6_init); +module_exit(ipcomp6_fini); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IP Payload Compression Protocol (IPComp) for IPv6 - RFC3173"); +MODULE_AUTHOR("Mitsuru KANDA "); + + Index: net/ipv6/ipv6_sockglue.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv6/ipv6_sockglue.c,v retrieving revision 1.1.1.19 retrieving revision 1.1.1.19.2.1 diff -u -r1.1.1.19 -r1.1.1.19.2.1 --- a/net/ipv6/ipv6_sockglue.c 14 Apr 2004 13:05:41 -0000 1.1.1.19 +++ b/net/ipv6/ipv6_sockglue.c 16 Apr 2004 13:16:25 -0000 1.1.1.19.2.1 @@ -51,6 +51,7 @@ #include #include #include +#include #include @@ -517,6 +518,10 @@ case IPV6_FLOWLABEL_MGR: retv = ipv6_flowlabel_opt(sk, optval, optlen); break; + case IPV6_IPSEC_POLICY: + case IPV6_XFRM_POLICY: + retv = xfrm_user_policy(sk, optname, optval, optlen); + break; #ifdef CONFIG_NETFILTER default: @@ -550,6 +555,15 @@ if (get_user(len, optlen)) return -EFAULT; switch (optname) { + case IPV6_ADDRFORM: + if (sk->protocol != IPPROTO_UDP && + sk->protocol != IPPROTO_TCP) + return -EINVAL; + if (sk->state != TCP_ESTABLISHED) + return -ENOTCONN; + val 
= sk->family; + break; + case IPV6_PKTOPTIONS: { struct msghdr msg; @@ -595,7 +609,7 @@ lock_sock(sk); dst = sk_dst_get(sk); if (dst) { - val = dst->pmtu; + val = dst_pmtu(dst) - dst->header_len; dst_release(dst); } release_sock(sk); Index: net/ipv6/ipv6_syms.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv6/ipv6_syms.c,v retrieving revision 1.1.1.11 retrieving revision 1.1.1.11.2.1 diff -u -r1.1.1.11 -r1.1.1.11.2.1 --- a/net/ipv6/ipv6_syms.c 14 Apr 2004 13:05:41 -0000 1.1.1.11 +++ b/net/ipv6/ipv6_syms.c 16 Apr 2004 13:16:25 -0000 1.1.1.11.2.1 @@ -6,6 +6,7 @@ #include #include #include +#include EXPORT_SYMBOL(ipv6_addr_type); EXPORT_SYMBOL(icmpv6_send); @@ -33,5 +34,15 @@ EXPORT_SYMBOL(ipv6_get_saddr); EXPORT_SYMBOL(ipv6_chk_addr); EXPORT_SYMBOL(in6_dev_finish_destroy); +EXPORT_SYMBOL(ip6_find_1stfragopt); +#ifdef CONFIG_XFRM +EXPORT_SYMBOL(xfrm6_rcv); +#endif +EXPORT_SYMBOL(rt6_lookup); +EXPORT_SYMBOL(fl6_sock_lookup); +EXPORT_SYMBOL(ipv6_ext_hdr); +EXPORT_SYMBOL(ip6_append_data); +EXPORT_SYMBOL(ip6_flush_pending_frames); +EXPORT_SYMBOL(ip6_push_pending_frames); EXPORT_SYMBOL(ipv6_skip_exthdr); Index: net/ipv6/ndisc.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv6/ndisc.c,v retrieving revision 1.1.1.28 retrieving revision 1.1.1.28.2.1 diff -u -r1.1.1.28 -r1.1.1.28.2.1 --- a/net/ipv6/ndisc.c 14 Apr 2004 13:05:41 -0000 1.1.1.28 +++ b/net/ipv6/ndisc.c 16 Apr 2004 13:16:25 -0000 1.1.1.28.2.1 @@ -71,6 +71,7 @@ #include #include +#include #include #include @@ -138,6 +139,19 @@ 30*HZ, 128, 512, 1024, }; +/* ND options */ +struct ndisc_options { + struct nd_opt_hdr *nd_opt_array[7]; + struct nd_opt_hdr *nd_opt_piend; +}; + +#define nd_opts_src_lladdr nd_opt_array[ND_OPT_SOURCE_LL_ADDR] +#define nd_opts_tgt_lladdr nd_opt_array[ND_OPT_TARGET_LL_ADDR] +#define nd_opts_pi 
nd_opt_array[ND_OPT_PREFIX_INFO] +#define nd_opts_pi_end nd_opt_piend +#define nd_opts_rh nd_opt_array[ND_OPT_REDIRECT_HDR] +#define nd_opts_mtu nd_opt_array[ND_OPT_MTU] + #define NDISC_OPT_SPACE(len) (((len)+2+7)&~7) static u8 *ndisc_fill_option(u8 *opt, int type, void *data, int data_len) @@ -154,8 +168,8 @@ return opt + space; } -struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur, - struct nd_opt_hdr *end) +static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur, + struct nd_opt_hdr *end) { int type; if (!cur || !end || cur >= end) @@ -167,8 +181,8 @@ return (cur <= end && cur->nd_opt_type == type ? cur : NULL); } -struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, - struct ndisc_options *ndopts) +static struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, + struct ndisc_options *ndopts) { struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)opt; @@ -333,8 +347,6 @@ unsigned char ha[MAX_ADDR_LEN]; unsigned char *h_dest = NULL; - skb_reserve(skb, (dev->hard_header_len + 15) & ~15); - if (dev->hard_header) { if (ipv6_addr_type(daddr) & IPV6_ADDR_MULTICAST) { ndisc_mc_map(daddr, ha, dev, 1); @@ -371,11 +383,38 @@ * Send a Neighbour Advertisement */ +static int ndisc_output(struct sk_buff *skb) +{ + if (skb) { + struct neighbour *neigh = (skb->dst ? 
skb->dst->neighbour : NULL); + if (ndisc_build_ll_hdr(skb, skb->dev, &skb->nh.ipv6h->daddr, neigh, skb->len) == 0) { + kfree_skb(skb); + return -EINVAL; + } + dev_queue_xmit(skb); + return 0; + } + return -EINVAL; +} + +static inline void ndisc_flow_init(struct flowi *fl, u8 type, + struct in6_addr *saddr, struct in6_addr *daddr) +{ + memset(fl, 0, sizeof(*fl)); + ipv6_addr_copy(&fl->fl6_src, saddr); + ipv6_addr_copy(&fl->fl6_dst, daddr); + fl->proto = IPPROTO_ICMPV6; + fl->fl_icmp_type = type; + fl->fl_icmp_code = 0; +} + void ndisc_send_na(struct net_device *dev, struct neighbour *neigh, struct in6_addr *daddr, struct in6_addr *solicited_addr, - int router, int solicited, int override, int inc_opt) + int router, int solicited, int override, int inc_opt) { - static struct in6_addr tmpaddr; + struct flowi fl; + struct dst_entry* dst; + struct in6_addr tmpaddr; struct inet6_ifaddr *ifp; struct sock *sk = ndisc_socket->sk; struct in6_addr *src_addr; @@ -386,6 +425,29 @@ len = sizeof(struct icmp6hdr) + sizeof(struct in6_addr); + /* for anycast or proxy, solicited_addr != src_addr */ + ifp = ipv6_get_ifaddr(solicited_addr, dev); + if (ifp) { + src_addr = solicited_addr; + in6_ifa_put(ifp); + } else { + if (ipv6_dev_get_saddr(dev, daddr, &tmpaddr, 0)) + return; + src_addr = &tmpaddr; + } + + ndisc_flow_init(&fl, NDISC_NEIGHBOUR_ADVERTISEMENT, src_addr, daddr); + + dst = ndisc_dst_alloc(dev, neigh, ndisc_output); + if (!dst) + return; + + err = xfrm_lookup(&dst, &fl, NULL, 0); + if (err < 0) { + dst_release(dst); + return; + } + if (inc_opt) { if (dev->addr_len) len += NDISC_OPT_SPACE(dev->addr_len); @@ -398,27 +460,14 @@ if (skb == NULL) { ND_PRINTK1("send_na: alloc skb failed\n"); - return; - } - /* for anycast or proxy, solicited_addr != src_addr */ - ifp = ipv6_get_ifaddr(solicited_addr, dev); - if (ifp) { - src_addr = solicited_addr; - in6_ifa_put(ifp); - } else { - if (ipv6_dev_get_saddr(dev, daddr, &tmpaddr, 0)) - return; - src_addr = &tmpaddr; - } - - if 
(ndisc_build_ll_hdr(skb, dev, daddr, neigh, len) == 0) { - kfree_skb(skb); + dst_release(dst); return; } + skb_reserve(skb, (dev->hard_header_len + 15) & ~15); ip6_nd_hdr(sk, skb, dev, src_addr, daddr, IPPROTO_ICMPV6, len); - msg = (struct nd_msg *) skb_put(skb, len); + skb->h.raw = (unsigned char*) msg = (struct nd_msg *) skb_put(skb, len); msg->icmph.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT; msg->icmph.icmp6_code = 0; @@ -441,7 +490,8 @@ csum_partial((__u8 *) msg, len, 0)); - dev_queue_xmit(skb); + skb->dst = dst; + dst_output(skb); ICMP6_INC_STATS(Icmp6OutNeighborAdvertisements); ICMP6_INC_STATS(Icmp6OutMsgs); @@ -451,6 +501,8 @@ struct in6_addr *solicit, struct in6_addr *daddr, struct in6_addr *saddr) { + struct flowi fl; + struct dst_entry* dst; struct sock *sk = ndisc_socket->sk; struct sk_buff *skb; struct nd_msg *msg; @@ -465,6 +517,18 @@ saddr = &addr_buf; } + ndisc_flow_init(&fl, NDISC_NEIGHBOUR_SOLICITATION, saddr, daddr); + + dst = ndisc_dst_alloc(dev, neigh, ndisc_output); + if (!dst) + return; + + err = xfrm_lookup(&dst, &fl, NULL, 0); + if (err < 0) { + dst_release(dst); + return; + } + len = sizeof(struct icmp6hdr) + sizeof(struct in6_addr); send_llinfo = dev->addr_len && ipv6_addr_type(saddr) != IPV6_ADDR_ANY; if (send_llinfo) @@ -474,17 +538,14 @@ 1, &err); if (skb == NULL) { ND_PRINTK1("send_ns: alloc skb failed\n"); + dst_release(dst); return; } - if (ndisc_build_ll_hdr(skb, dev, daddr, neigh, len) == 0) { - kfree_skb(skb); - return; - } - + skb_reserve(skb, (dev->hard_header_len + 15) & ~15); ip6_nd_hdr(sk, skb, dev, saddr, daddr, IPPROTO_ICMPV6, len); - msg = (struct nd_msg *)skb_put(skb, len); + skb->h.raw = (unsigned char*) msg = (struct nd_msg *)skb_put(skb, len); msg->icmph.icmp6_type = NDISC_NEIGHBOUR_SOLICITATION; msg->icmph.icmp6_code = 0; msg->icmph.icmp6_cksum = 0; @@ -503,7 +564,8 @@ csum_partial((__u8 *) msg, len, 0)); /* send it! 
*/ - dev_queue_xmit(skb); + skb->dst = dst; + dst_output(skb); ICMP6_INC_STATS(Icmp6OutNeighborSolicits); ICMP6_INC_STATS(Icmp6OutMsgs); @@ -512,6 +574,8 @@ void ndisc_send_rs(struct net_device *dev, struct in6_addr *saddr, struct in6_addr *daddr) { + struct flowi fl; + struct dst_entry* dst; struct sock *sk = ndisc_socket->sk; struct sk_buff *skb; struct icmp6hdr *hdr; @@ -519,6 +583,18 @@ int len; int err; + ndisc_flow_init(&fl, NDISC_ROUTER_SOLICITATION, saddr, daddr); + + dst = ndisc_dst_alloc(dev, NULL, ndisc_output); + if (!dst) + return; + + err = xfrm_lookup(&dst, &fl, NULL, 0); + if (err < 0) { + dst_release(dst); + return; + } + len = sizeof(struct icmp6hdr); if (dev->addr_len) len += NDISC_OPT_SPACE(dev->addr_len); @@ -530,14 +606,10 @@ return; } - if (ndisc_build_ll_hdr(skb, dev, daddr, NULL, len) == 0) { - kfree_skb(skb); - return; - } - + skb_reserve(skb, (dev->hard_header_len + 15) & ~15); ip6_nd_hdr(sk, skb, dev, saddr, daddr, IPPROTO_ICMPV6, len); - hdr = (struct icmp6hdr *) skb_put(skb, len); + skb->h.raw = (unsigned char*) hdr = (struct icmp6hdr *) skb_put(skb, len); hdr->icmp6_type = NDISC_ROUTER_SOLICITATION; hdr->icmp6_code = 0; hdr->icmp6_cksum = 0; @@ -554,7 +626,8 @@ csum_partial((__u8 *) hdr, len, 0)); /* send it! */ - dev_queue_xmit(skb); + skb->dst = dst; + dst_output(skb); ICMP6_INC_STATS(Icmp6OutRouterSolicits); ICMP6_INC_STATS(Icmp6OutMsgs); @@ -598,7 +671,7 @@ } } -void ndisc_recv_ns(struct sk_buff *skb) +static void ndisc_recv_ns(struct sk_buff *skb) { struct nd_msg *msg = (struct nd_msg *)skb->h.raw; struct in6_addr *saddr = &skb->nh.ipv6h->saddr; @@ -610,6 +683,7 @@ struct net_device *dev = skb->dev; struct inet6_ifaddr *ifp; struct neighbour *neigh; + int addr_type = ipv6_addr_type(saddr); if (skb->len < sizeof(struct nd_msg)) { if (net_ratelimit()) @@ -623,6 +697,20 @@ return; } + /* + * RFC2461 7.1.1: + * DAD has to be destined for solicited node multicast address. 
+ */ + if (addr_type == IPV6_ADDR_ANY && + !(daddr->s6_addr32[0] == htonl(0xff020000) && + daddr->s6_addr32[1] == htonl(0x00000000) && + daddr->s6_addr32[2] == htonl(0x00000001) && + daddr->s6_addr [12] == 0xff )) { + if (net_ratelimit()) + printk(KERN_DEBUG "ICMP6 NS: bad DAD packet (wrong destination\n"); + return; + } + if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) { if (net_ratelimit()) printk(KERN_WARNING "ICMP NS: invalid ND option, ignored.\n"); @@ -637,23 +725,20 @@ printk(KERN_WARNING "ICMP NS: bad lladdr length.\n"); return; } - } - /* XXX: RFC2461 7.1.1: - * If the IP source address is the unspecified address, there - * MUST NOT be source link-layer address option in the message. - * - * NOTE! Linux kernel < 2.4.4 broke this rule. - */ - - /* XXX: RFC2461 7.1.1: - * If the IP source address is the unspecified address, the IP - * destination address MUST be a solicited-node multicast address. - */ + /* XXX: RFC2461 7.1.1: + * If the IP source address is the unspecified address, + * there MUST NOT be source link-layer address option + * in the message. + */ + if (addr_type == IPV6_ADDR_ANY) { + if (net_ratelimit()) + printk(KERN_WARNING "ICMP6 NS: bad DAD packet (link-layer address option)\n"); + return; + } + } if ((ifp = ipv6_get_ifaddr(&msg->target, dev)) != NULL) { - int addr_type = ipv6_addr_type(saddr); - if (ifp->flags & IFA_F_TENTATIVE) { /* Address is tentative. If the source is unspecified address, it is someone @@ -686,8 +771,7 @@ ipv6_addr_all_nodes(&maddr); ndisc_send_na(dev, NULL, &maddr, &ifp->addr, ifp->idev->cnf.forwarding, 0, - ipv6_addr_type(&ifp->addr)&IPV6_ADDR_ANYCAST ? 0 : 1, - 1); + 1, 1); in6_ifa_put(ifp); return; } @@ -710,8 +794,7 @@ if (neigh || !dev->hard_header) { ndisc_send_na(dev, neigh, saddr, &ifp->addr, ifp->idev->cnf.forwarding, 1, - ipv6_addr_type(&ifp->addr)&IPV6_ADDR_ANYCAST ? 
0 : 1, - 1); + 1, 1); if (neigh) neigh_release(neigh); } @@ -719,7 +802,6 @@ in6_ifa_put(ifp); } else if (ipv6_chk_acast_addr(dev, &msg->target)) { struct inet6_dev *idev = in6_dev_get(dev); - int addr_type = ipv6_addr_type(saddr); /* anycast */ @@ -763,10 +845,10 @@ in6_dev_put(idev); } else { struct inet6_dev *in6_dev = in6_dev_get(dev); - int addr_type = ipv6_addr_type(saddr); if (in6_dev && in6_dev->cnf.forwarding && - (addr_type & IPV6_ADDR_UNICAST) && + (addr_type & IPV6_ADDR_UNICAST || + addr_type == IPV6_ADDR_ANY) && pneigh_lookup(&nd_tbl, &msg->target, dev, 0)) { int inc = ipv6_addr_type(daddr)&IPV6_ADDR_MULTICAST; @@ -779,12 +861,20 @@ else nd_tbl.stats.rcv_probes_ucast++; - neigh = neigh_event_ns(&nd_tbl, lladdr, saddr, dev); + if (addr_type & IPV6_ADDR_UNICAST) { + neigh = neigh_event_ns(&nd_tbl, lladdr, saddr, dev); - if (neigh) { - ndisc_send_na(dev, neigh, saddr, &msg->target, - 0, 1, 0, 1); - neigh_release(neigh); + if (neigh) { + ndisc_send_na(dev, neigh, saddr, &msg->target, + 0, 1, 0, 1); + neigh_release(neigh); + } + } else { + /* proxy should also protect against DAD */ + struct in6_addr maddr; + ipv6_addr_all_nodes(&maddr); + ndisc_send_na(dev, NULL, &maddr, &msg->target, + 0, 0, 0, 1); } } else { struct sk_buff *n = skb_clone(skb, GFP_ATOMIC); @@ -800,7 +890,7 @@ return; } -void ndisc_recv_na(struct sk_buff *skb) +static void ndisc_recv_na(struct sk_buff *skb) { struct nd_msg *msg = (struct nd_msg *)skb->h.raw; struct in6_addr *saddr = &skb->nh.ipv6h->saddr; @@ -870,12 +960,8 @@ */ struct rt6_info *rt; rt = rt6_get_dflt_router(saddr, dev); - if (rt) { - /* It is safe only because - we aer in BH */ - dst_release(&rt->u.dst); - ip6_del_rt(rt, NULL); - } + if (rt) + ip6_del_rt(rt, NULL, NULL); } } else { if (msg->icmph.icmp6_router) @@ -960,7 +1046,7 @@ rt = rt6_get_dflt_router(&skb->nh.ipv6h->saddr, skb->dev); if (rt && lifetime == 0) { - ip6_del_rt(rt, NULL); + ip6_del_rt(rt, NULL, NULL); rt = NULL; } @@ -1072,7 +1158,7 @@ in6_dev->cnf.mtu6 = 
mtu; if (rt) - rt->u.dst.pmtu = mtu; + rt->u.dst.metrics[RTAX_MTU-1] = mtu; rt6_mtu_change(skb->dev, mtu); } @@ -1195,27 +1281,44 @@ struct in6_addr *addrp; struct net_device *dev; struct rt6_info *rt; + struct dst_entry *dst; + struct flowi fl; u8 *opt; int rd_len; int err; int hlen; dev = skb->dev; - rt = rt6_lookup(&skb->nh.ipv6h->saddr, NULL, dev->ifindex, 1); + if (ipv6_get_lladdr(dev, &saddr_buf)) { + ND_PRINTK1("redirect: no link_local addr for dev\n"); + return; + } + + ndisc_flow_init(&fl, NDISC_REDIRECT, &saddr_buf, &skb->nh.ipv6h->saddr); + + rt = rt6_lookup(&skb->nh.ipv6h->saddr, NULL, dev->ifindex, 1); if (rt == NULL) return; + dst = &rt->u.dst; + + err = xfrm_lookup(&dst, &fl, NULL, 0); + if (err) { + dst_release(dst); + return; + } + + rt = (struct rt6_info *) dst; if (rt->rt6i_flags & RTF_GATEWAY) { ND_PRINTK1("ndisc_send_redirect: not a neighbour\n"); - dst_release(&rt->u.dst); + dst_release(dst); return; } - if (!xrlim_allow(&rt->u.dst, 1*HZ)) { - dst_release(&rt->u.dst); + if (!xrlim_allow(dst, 1*HZ)) { + dst_release(dst); return; } - dst_release(&rt->u.dst); if (dev->addr_len) { if (neigh->nud_state&NUD_VALID) { @@ -1225,6 +1328,7 @@ We will make it later, when will be sure, that it is alive. 
*/ + dst_release(dst); return; } } @@ -1234,11 +1338,6 @@ rd_len &= ~0x7; len += rd_len; - if (ipv6_get_lladdr(dev, &saddr_buf)) { - ND_PRINTK1("redirect: no link_local addr for dev\n"); - return; - } - buff = sock_alloc_send_skb(sk, MAX_HEADER + len + dev->hard_header_len + 15, 1, &err); if (buff == NULL) { @@ -1248,15 +1347,11 @@ hlen = 0; - if (ndisc_build_ll_hdr(buff, dev, &skb->nh.ipv6h->saddr, NULL, len) == 0) { - kfree_skb(buff); - return; - } - + skb_reserve(buff, (dev->hard_header_len + 15) & ~15); ip6_nd_hdr(sk, buff, dev, &saddr_buf, &skb->nh.ipv6h->saddr, IPPROTO_ICMPV6, len); - icmph = (struct icmp6hdr *) skb_put(buff, len); + buff->h.raw = (unsigned char*) icmph = (struct icmp6hdr *) skb_put(buff, len); memset(icmph, 0, sizeof(struct icmp6hdr)); icmph->icmp6_type = NDISC_REDIRECT; @@ -1294,7 +1389,8 @@ len, IPPROTO_ICMPV6, csum_partial((u8 *) icmph, len, 0)); - dev_queue_xmit(buff); + buff->dst = dst; + dst_output(buff); ICMP6_INC_STATS(Icmp6OutRedirects); ICMP6_INC_STATS(Icmp6OutMsgs); @@ -1414,6 +1510,9 @@ void ndisc_cleanup(void) { +#ifdef CONFIG_SYSCTL + neigh_sysctl_unregister(&nd_tbl.parms); +#endif neigh_table_clear(&nd_tbl); sock_release(ndisc_socket); ndisc_socket = NULL; /* For safety. 
*/ Index: net/ipv6/protocol.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv6/protocol.c,v retrieving revision 1.1.1.15 retrieving revision 1.1.1.15.2.1 diff -u -r1.1.1.15 -r1.1.1.15.2.1 --- a/net/ipv6/protocol.c 20 May 2001 00:56:43 -0000 1.1.1.15 +++ b/net/ipv6/protocol.c 16 Apr 2004 13:16:26 -0000 1.1.1.15.2.1 @@ -42,77 +42,42 @@ struct inet6_protocol *inet6_protos[MAX_INET_PROTOS]; -void inet6_add_protocol(struct inet6_protocol *prot) +int inet6_add_protocol(struct inet6_protocol *prot, unsigned char protocol) { - unsigned char hash; - struct inet6_protocol *p2; + int ret, hash = protocol & (MAX_INET_PROTOS - 1); - hash = prot->protocol & (MAX_INET_PROTOS - 1); br_write_lock_bh(BR_NETPROTO_LOCK); - prot->next = inet6_protos[hash]; - inet6_protos[hash] = prot; - prot->copy = 0; - - /* - * Set the copy bit if we need to. - */ - - p2 = (struct inet6_protocol *) prot->next; - while(p2 != NULL) { - if (p2->protocol == prot->protocol) { - prot->copy = 1; - break; - } - p2 = (struct inet6_protocol *) p2->next; + + if (inet6_protos[hash]) { + ret = -1; + } else { + inet6_protos[hash] = prot; + ret = 0; } + br_write_unlock_bh(BR_NETPROTO_LOCK); + + return ret; } /* * Remove a protocol from the hash tables. 
*/ -int inet6_del_protocol(struct inet6_protocol *prot) +int inet6_del_protocol(struct inet6_protocol *prot, unsigned char protocol) { - struct inet6_protocol *p; - struct inet6_protocol *lp = NULL; - unsigned char hash; + int ret, hash = protocol & (MAX_INET_PROTOS - 1); - hash = prot->protocol & (MAX_INET_PROTOS - 1); br_write_lock_bh(BR_NETPROTO_LOCK); - if (prot == inet6_protos[hash]) { - inet6_protos[hash] = (struct inet6_protocol *) inet6_protos[hash]->next; - br_write_unlock_bh(BR_NETPROTO_LOCK); - return(0); - } - - p = (struct inet6_protocol *) inet6_protos[hash]; - if (p != NULL && p->protocol == prot->protocol) - lp = p; - - while(p != NULL) { - /* - * We have to worry if the protocol being deleted is - * the last one on the list, then we may need to reset - * someone's copied bit. - */ - if (p->next != NULL && p->next == prot) { - /* - * if we are the last one with this protocol and - * there is a previous one, reset its copy bit. - */ - if (prot->copy == 0 && lp != NULL) - lp->copy = 0; - p->next = prot->next; - br_write_unlock_bh(BR_NETPROTO_LOCK); - return(0); - } - if (p->next != NULL && p->next->protocol == prot->protocol) - lp = p->next; - - p = (struct inet6_protocol *) p->next; + if (inet6_protos[hash] != prot) { + ret = -1; + } else { + inet6_protos[hash] = NULL; + ret = 0; } + br_write_unlock_bh(BR_NETPROTO_LOCK); - return(-1); + + return ret; } Index: net/ipv6/raw.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv6/raw.c,v retrieving revision 1.1.1.25 retrieving revision 1.1.1.25.2.1 diff -u -r1.1.1.25 -r1.1.1.25.2.1 --- a/net/ipv6/raw.c 28 Nov 2003 18:26:21 -0000 1.1.1.25 +++ b/net/ipv6/raw.c 16 Apr 2004 13:16:26 -0000 1.1.1.25.2.1 @@ -12,6 +12,7 @@ * Fixes: * Hideaki YOSHIFUJI : sin6_scope_id support * YOSHIFUJI,H.@USAGI : raw checksum (RFC2292(bis) compliance) + * Kazunori MIYAZAWA @USAGI: change process style to use ip6_append_data * * This 
program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -29,6 +30,8 @@ #include #include #include +#include +#include #include #include @@ -45,6 +48,7 @@ #include #include +#include struct sock *raw_v6_htable[RAWV6_HTABLE_SIZE]; rwlock_t raw_v6_lock = RW_LOCK_UNLOCKED; @@ -133,12 +137,14 @@ * demultiplex raw sockets. * (should consider queueing the skb in the sock receive_queue * without calling rawv6.c) + * + * Caller owns SKB so we must make clones. */ -struct sock * ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) +void ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) { struct in6_addr *saddr; struct in6_addr *daddr; - struct sock *sk, *sk2; + struct sock *sk; __u8 hash; saddr = &skb->nh.ipv6h->saddr; @@ -159,30 +165,18 @@ sk = __raw_v6_lookup(sk, nexthdr, daddr, saddr); - if (sk) { - sk2 = sk; - - while ((sk2 = __raw_v6_lookup(sk2->next, nexthdr, daddr, saddr))) { - struct sk_buff *buff; - - if (nexthdr == IPPROTO_ICMPV6 && - icmpv6_filter(sk2, skb)) - continue; - - buff = skb_clone(skb, GFP_ATOMIC); - if (buff) - rawv6_rcv(sk2, buff); + while (sk) { + if (nexthdr != IPPROTO_ICMPV6 || !icmpv6_filter(sk, skb)) { + struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); + + /* Not releasing hash table! */ + if (clone) + rawv6_rcv(sk, clone); } + sk = __raw_v6_lookup(sk->next, nexthdr, daddr, saddr); } - - if (sk && nexthdr == IPPROTO_ICMPV6 && icmpv6_filter(sk, skb)) - sk = NULL; - out: - if (sk) - sock_hold(sk); read_unlock(&raw_v6_lock); - return sk; } /* This cleans up af_inet6 a bit. -DaveM */ @@ -309,6 +303,11 @@ */ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) { + if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) { + kfree_skb(skb); + return NET_RX_DROP; + } + if (!sk->tp_pinfo.tp_raw.checksum) skb->ip_summed = CHECKSUM_UNNECESSARY; @@ -434,86 +433,114 @@ goto out_free; } -/* - * Sending... 
- */ +static int rawv6_push_pending_frames(struct sock *sk, struct flowi *fl, struct raw6_opt *opt, int len) +{ + struct sk_buff *skb; + int err = 0; + u16 *csum; -struct rawv6_fakehdr { - struct iovec *iov; - struct sock *sk; - __u32 len; - __u32 cksum; - __u32 proto; - struct in6_addr *daddr; -}; + if ((skb = skb_peek(&sk->write_queue)) == NULL) + goto out; -static int rawv6_getfrag(const void *data, struct in6_addr *saddr, - char *buff, unsigned int offset, unsigned int len) -{ - struct iovec *iov = (struct iovec *) data; + if (opt->offset + 1 < len) + csum = (u16 *)(skb->h.raw + opt->offset); + else { + err = -EINVAL; + goto out; + } - return memcpy_fromiovecend(buff, iov, offset, len); + if (skb_queue_len(&sk->write_queue) == 1) { + /* + * Only one fragment on the socket. + */ + /* should be check HW csum miyazawa */ + *csum = csum_ipv6_magic(&fl->fl6_src, + &fl->fl6_dst, + len, fl->proto, skb->csum); + } else { + u32 tmp_csum = 0; + + skb_queue_walk(&sk->write_queue, skb) { + tmp_csum = csum_add(tmp_csum, skb->csum); + } + + tmp_csum = csum_ipv6_magic(&fl->fl6_src, + &fl->fl6_dst, + len, fl->proto, tmp_csum); + *csum = tmp_csum; + } + if (*csum == 0) + *csum = -1; + ip6_push_pending_frames(sk); +out: + return err; } -static int rawv6_frag_cksum(const void *data, struct in6_addr *addr, - char *buff, unsigned int offset, - unsigned int len) +static int rawv6_send_hdrinc(struct sock *sk, void *from, int length, + struct flowi *fl, struct rt6_info *rt, + unsigned int flags) { - struct rawv6_fakehdr *hdr = (struct rawv6_fakehdr *) data; - - if (csum_partial_copy_fromiovecend(buff, hdr->iov, offset, - len, &hdr->cksum)) - return -EFAULT; - - if (offset == 0) { - struct sock *sk; - struct raw6_opt *opt; - struct in6_addr *daddr; - - sk = hdr->sk; - opt = &sk->tp_pinfo.tp_raw; + struct inet_opt *inet = inet_sk(sk); + struct ipv6hdr *iph; + struct sk_buff *skb; + unsigned int hh_len; + int err; - if (hdr->daddr) - daddr = hdr->daddr; - else - daddr = addr + 1; - - 
hdr->cksum = csum_ipv6_magic(addr, daddr, hdr->len, - hdr->proto, hdr->cksum); - - if (opt->offset + 1 < len) { - __u16 *csum; + if (length > rt->u.dst.dev->mtu) { + ipv6_local_error(sk, EMSGSIZE, fl, rt->u.dst.dev->mtu); + return -EMSGSIZE; + } + if (flags&MSG_PROBE) + goto out; - csum = (__u16 *) (buff + opt->offset); - if (*csum) { - /* in case cksum was not initialized */ - __u32 sum = hdr->cksum; - sum += *csum; - *csum = hdr->cksum = (sum + (sum>>16)); - } else { - *csum = hdr->cksum; - } - } else { - if (net_ratelimit()) - printk(KERN_DEBUG "icmp: cksum offset too big\n"); - return -EINVAL; - } - } - return 0; -} + hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); + + skb = sock_alloc_send_skb(sk, length+hh_len+15, + flags&MSG_DONTWAIT, &err); + if (skb == NULL) + goto error; + skb_reserve(skb, hh_len); + + skb->priority = sk->priority; + skb->dst = dst_clone(&rt->u.dst); + + skb->nh.ipv6h = iph = (struct ipv6hdr *)skb_put(skb, length); + + skb->ip_summed = CHECKSUM_NONE; + skb->h.raw = skb->nh.raw; + err = memcpy_fromiovecend((void *)iph, from, 0, length); + if (err) + goto error_fault; + + err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, rt->u.dst.dev, + dst_output); + if (err > 0) + err = inet->recverr ? net_xmit_errno(err) : 0; + if (err) + goto error; +out: + return 0; +error_fault: + err = -EFAULT; + kfree_skb(skb); +error: + IP6_INC_STATS(Ip6OutDiscards); + return err; +} static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, int len) { struct ipv6_txoptions opt_space; struct sockaddr_in6 * sin6 = (struct sockaddr_in6 *) msg->msg_name; + struct in6_addr *daddr; struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; + struct raw6_opt *raw_opt = raw6_sk(sk); struct ipv6_txoptions *opt = NULL; struct ip6_flowlabel *flowlabel = NULL; + struct dst_entry *dst = NULL; struct flowi fl; int addr_len = msg->msg_namelen; - struct in6_addr *daddr; - struct raw6_opt *raw_opt; int hlimit = -1; u16 proto; int err; @@ -531,9 +558,7 @@ /* * Get and verify the address. 
*/ - - fl.fl6_flowlabel = 0; - fl.oif = 0; + memset(&fl, 0, sizeof(fl)); if (sin6) { if (addr_len < SIN6_LEN_RFC2133) @@ -547,6 +572,8 @@ if (!proto) proto = sk->num; + else if (proto != sk->num) + return(-EINVAL); if (proto > 255) return(-EINVAL); @@ -585,16 +612,17 @@ * unspecfied destination address * treated as error... is this correct ? */ + fl6_sock_release(flowlabel); return(-EINVAL); } if (fl.oif == 0) fl.oif = sk->bound_dev_if; - fl.fl6_src = NULL; if (msg->msg_controllen) { opt = &opt_space; memset(opt, 0, sizeof(struct ipv6_txoptions)); + opt->tot_len = sizeof(struct ipv6_txoptions); err = datagram_send_ctl(msg, &fl, opt, &hlimit); if (err < 0) { @@ -614,39 +642,71 @@ if (flowlabel) opt = fl6_merge_options(&opt_space, flowlabel, opt); - raw_opt = &sk->tp_pinfo.tp_raw; - fl.proto = proto; - fl.fl6_dst = daddr; - if (fl.fl6_src == NULL && !ipv6_addr_any(&np->saddr)) - fl.fl6_src = &np->saddr; - fl.uli_u.icmpt.type = 0; - fl.uli_u.icmpt.code = 0; - - if (raw_opt->checksum) { - struct rawv6_fakehdr hdr; - - hdr.iov = msg->msg_iov; - hdr.sk = sk; - hdr.len = len; - hdr.cksum = 0; - hdr.proto = proto; + ipv6_addr_copy(&fl.fl6_dst, daddr); + if (ipv6_addr_any(&fl.fl6_src) && !ipv6_addr_any(&np->saddr)) + ipv6_addr_copy(&fl.fl6_src, &np->saddr); + + /* merge ip6_build_xmit from ip6_output */ + if (opt && opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt; + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); + } + + if (!fl.oif && ipv6_addr_is_multicast(&fl.fl6_dst)) + fl.oif = np->mcast_oif; + + err = ip6_dst_lookup(sk, &dst, &fl); + if (err) + goto out; - if (opt && opt->srcrt) - hdr.daddr = daddr; + if (hlimit < 0) { + if (ipv6_addr_is_multicast(&fl.fl6_dst)) + hlimit = np->mcast_hops; else - hdr.daddr = NULL; + hlimit = np->hop_limit; + if (hlimit < 0) + hlimit = dst_metric(dst, RTAX_HOPLIMIT); + } + + if (msg->msg_flags&MSG_CONFIRM) + goto do_confirm; - err = ip6_build_xmit(sk, rawv6_frag_cksum, &hdr, &fl, len, - opt, hlimit, msg->msg_flags); 
+back_from_confirm: + if (sk->protinfo.af_inet.hdrincl) { + err = rawv6_send_hdrinc(sk, msg->msg_iov, len, &fl, (struct rt6_info*)dst, msg->msg_flags); } else { - err = ip6_build_xmit(sk, rawv6_getfrag, msg->msg_iov, &fl, len, - opt, hlimit, msg->msg_flags); + lock_sock(sk); + err = ip6_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0, + hlimit, opt, &fl, (struct rt6_info*)dst, msg->msg_flags); + + if (err) + ip6_flush_pending_frames(sk); + else if (!(msg->msg_flags & MSG_MORE)) { + if (raw_opt->checksum) { + err = rawv6_push_pending_frames(sk, &fl, raw_opt, len); + } else { + err = ip6_push_pending_frames(sk); + } + } } +done: + ip6_dst_store(sk, dst, + !ipv6_addr_cmp(&fl.fl6_dst, &np->daddr) ? + &np->daddr : NULL); + if (err > 0) + err = np->recverr ? net_xmit_errno(err) : 0; + release_sock(sk); +out: fl6_sock_release(flowlabel); - return err<0?err:len; +do_confirm: + dst_confirm(dst); + if (!(msg->msg_flags & MSG_PROBE) || len) + goto back_from_confirm; + err = 0; + goto done; } static int rawv6_seticmpfilter(struct sock *sk, int level, int optname, Index: net/ipv6/reassembly.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv6/reassembly.c,v retrieving revision 1.1.1.21 retrieving revision 1.1.1.21.2.1 diff -u -r1.1.1.21 -r1.1.1.21.2.1 --- a/net/ipv6/reassembly.c 25 Aug 2003 11:44:44 -0000 1.1.1.21 +++ b/net/ipv6/reassembly.c 16 Apr 2004 13:16:26 -0000 1.1.1.21.2.1 @@ -23,6 +23,10 @@ * Horst von Brand Add missing #include * Alexey Kuznetsov SMP races, threading, cleanup. * Patrick McHardy LRU queue of frag heads for evictor. + * Mitsuru KANDA @USAGI Register inet6_protocol{}. + * David Stevens and + * YOSHIFUJI,H. @USAGI Always remove fragment header to + * calculate ICV correctly. 
*/ #include #include @@ -421,7 +425,7 @@ end = offset + (ntohs(skb->nh.ipv6h->payload_len) - ((u8 *) (fhdr + 1) - (u8 *) (skb->nh.ipv6h + 1))); - if ((unsigned int)end >= 65536) { + if ((unsigned int)end > IPV6_MAXPLEN) { icmpv6_param_prob(skb,ICMPV6_HDR_FIELD, (u8*)&fhdr->frag_off - skb->nh.raw); return; } @@ -431,7 +435,7 @@ csum_partial(skb->nh.raw, (u8*)(fhdr+1)-skb->nh.raw, 0)); /* Is this the final fragment? */ - if (!(fhdr->frag_off & htons(0x0001))) { + if (!(fhdr->frag_off & htons(IP6_MF))) { /* If we already have some bits beyond end * or have different end, the segment is corrupted. */ @@ -579,12 +583,12 @@ * the last and the first frames arrived and all the bits are here. */ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff **skb_in, + unsigned int *nhoffp, struct net_device *dev) { struct sk_buff *fp, *head = fq->fragments; - int remove_fraghdr = 0; int payload_len; - int nhoff; + unsigned int nhoff; fq_kill(fq); @@ -592,15 +596,9 @@ BUG_TRAP(FRAG6_CB(head)->offset == 0); /* Unfragmented part is taken from the first segment. */ - payload_len = (head->data - head->nh.raw) - sizeof(struct ipv6hdr) + fq->len; - nhoff = head->h.raw - head->nh.raw; - - if (payload_len > 65535) { - payload_len -= 8; - if (payload_len > 65535) - goto out_oversize; - remove_fraghdr = 1; - } + payload_len = (head->data - head->nh.raw) - sizeof(struct ipv6hdr) + fq->len - sizeof(struct frag_hdr); + if (payload_len > IPV6_MAXPLEN) + goto out_oversize; /* Head of list must not be cloned. */ if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC)) @@ -629,18 +627,14 @@ atomic_add(clone->truesize, &ip6_frag_mem); } - /* Normally we do not remove frag header from datagram, but - * we have to do this and to relocate header, when payload - * is > 65535-8. 
*/ - if (remove_fraghdr) { - nhoff = fq->nhoffset; - head->nh.raw[nhoff] = head->h.raw[0]; - memmove(head->head+8, head->head, (head->data-head->head)-8); - head->mac.raw += 8; - head->nh.raw += 8; - } else { - ((struct frag_hdr*)head->h.raw)->frag_off = 0; - } + /* We have to remove fragment header from datagram and to relocate + * header in order to calculate ICV correctly. */ + nhoff = fq->nhoffset; + head->nh.raw[nhoff] = head->h.raw[0]; + memmove(head->head + sizeof(struct frag_hdr), head->head, + (head->data - head->head) - sizeof(struct frag_hdr)); + head->mac.raw += sizeof(struct frag_hdr); + head->nh.raw += sizeof(struct frag_hdr); skb_shinfo(head)->frag_list = head->next; head->h.raw = head->data; @@ -671,7 +665,8 @@ IP6_INC_STATS_BH(Ip6ReasmOKs); fq->fragments = NULL; - return nhoff; + *nhoffp = nhoff; + return 1; out_oversize: if (net_ratelimit()) @@ -685,7 +680,7 @@ return -1; } -int ipv6_reassembly(struct sk_buff **skbp, int nhoff) +static int ipv6_frag_rcv(struct sk_buff **skbp, unsigned int *nhoffp) { struct sk_buff *skb = *skbp; struct net_device *dev = skb->dev; @@ -715,7 +710,8 @@ skb->h.raw += sizeof(struct frag_hdr); IP6_INC_STATS_BH(Ip6ReasmOKs); - return (u8*)fhdr - skb->nh.raw; + *nhoffp = (u8*)fhdr - skb->nh.raw; + return 1; } if (atomic_read(&ip6_frag_mem) > sysctl_ip6frag_high_thresh) @@ -726,11 +722,11 @@ spin_lock(&fq->lock); - ip6_frag_queue(fq, skb, fhdr, nhoff); + ip6_frag_queue(fq, skb, fhdr, *nhoffp); if (fq->last_in == (FIRST_IN|LAST_IN) && fq->meat == fq->len) - ret = ip6_frag_reasm(fq, skbp, dev); + ret = ip6_frag_reasm(fq, skbp, nhoffp, dev); spin_unlock(&fq->lock); fq_put(fq); @@ -742,8 +738,17 @@ return -1; } +static struct inet6_protocol frag_protocol = +{ + .handler = ipv6_frag_rcv, + .flags = INET6_PROTO_NOPOLICY, +}; + void __init ipv6_frag_init(void) { + if (inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT) < 0) + printk(KERN_ERR "ipv6_frag_init: Could not register protocol\n"); + ip6_frag_hash_rnd = (u32) 
((num_physpages ^ (num_physpages>>7)) ^ (jiffies ^ (jiffies >> 6))); Index: net/ipv6/route.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv6/route.c,v retrieving revision 1.1.1.27 retrieving revision 1.1.1.27.2.1 diff -u -r1.1.1.27 -r1.1.1.27.2.1 --- a/net/ipv6/route.c 18 Feb 2004 13:36:32 -0000 1.1.1.27 +++ b/net/ipv6/route.c 16 Apr 2004 13:16:26 -0000 1.1.1.27.2.1 @@ -49,6 +49,8 @@ #include #include #include +#include +#include #include @@ -56,8 +58,6 @@ #include #endif -#undef CONFIG_RT6_POLICY - /* Set to 3 to get tracing. */ #define RT6_DEBUG 2 @@ -80,39 +80,43 @@ static struct rt6_info * ip6_rt_copy(struct rt6_info *ort); static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); -static struct dst_entry *ip6_dst_reroute(struct dst_entry *dst, - struct sk_buff *skb); static struct dst_entry *ip6_negative_advice(struct dst_entry *); static int ip6_dst_gc(void); static int ip6_pkt_discard(struct sk_buff *skb); static void ip6_link_failure(struct sk_buff *skb); +static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu); struct dst_ops ip6_dst_ops = { - AF_INET6, - __constant_htons(ETH_P_IPV6), - 1024, - - ip6_dst_gc, - ip6_dst_check, - ip6_dst_reroute, - NULL, - ip6_negative_advice, - ip6_link_failure, - sizeof(struct rt6_info), + .family = AF_INET6, + .protocol = __constant_htons(ETH_P_IPV6), + .gc = ip6_dst_gc, + .gc_thresh = 1024, + .check = ip6_dst_check, + .negative_advice = ip6_negative_advice, + .link_failure = ip6_link_failure, + .update_pmtu = ip6_rt_update_pmtu, + .entry_size = sizeof(struct rt6_info), }; struct rt6_info ip6_null_entry = { - {{NULL, ATOMIC_INIT(1), 1, &loopback_dev, - -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - -ENETUNREACH, NULL, NULL, - ip6_pkt_discard, ip6_pkt_discard, -#ifdef CONFIG_NET_CLS_ROUTE - 0, -#endif - &ip6_dst_ops}}, - NULL, {{{0}}}, RTF_REJECT|RTF_NONEXTHOP, ~0U, - 255, ATOMIC_INIT(1), {NULL}, {{{{0}}}, 
0}, {{{{0}}}, 0} + .u = { + .dst = { + .__refcnt = ATOMIC_INIT(1), + .__use = 1, + .dev = &loopback_dev, + .obsolete = -1, + .error = -ENETUNREACH, + .metrics = { [RTAX_HOPLIMIT - 1] = 255, }, + .input = ip6_pkt_discard, + .output = ip6_pkt_discard, + .ops = &ip6_dst_ops, + .path = (struct dst_entry*)&ip6_null_entry, + } + }, + .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), + .rt6i_metric = ~(u32) 0, + .rt6i_ref = ATOMIC_INIT(1), }; struct fib6_node ip6_routing_table = { @@ -121,29 +125,17 @@ 0, RTN_ROOT|RTN_TL_ROOT|RTN_RTINFO, 0 }; -#ifdef CONFIG_RT6_POLICY -int ip6_rt_policy = 0; - -struct pol_chain *rt6_pol_list = NULL; - - -static int rt6_flow_match_in(struct rt6_info *rt, struct sk_buff *skb); -static int rt6_flow_match_out(struct rt6_info *rt, struct sock *sk); - -static struct rt6_info *rt6_flow_lookup(struct rt6_info *rt, - struct in6_addr *daddr, - struct in6_addr *saddr, - struct fl_acc_args *args); - -#else -#define ip6_rt_policy (0) -#endif - /* Protects all the ip6 fib */ rwlock_t rt6_lock = RW_LOCK_UNLOCKED; +/* allocate dst with ip6_dst_ops */ +static __inline__ struct rt6_info *ip6_dst_alloc(void) +{ + return dst_alloc(&ip6_dst_ops); +} + /* * Route lookup. Any rt6_lock is implied. */ @@ -269,9 +261,12 @@ } } - if (match) + if (match) { + if (rt6_dflt_pointer != match) + RT6_TRACE("changed default router: %p->%p\n", + rt6_dflt_pointer, match); rt6_dflt_pointer = match; - + } spin_unlock(&rt6_dflt_lock); if (!match) { @@ -325,12 +320,12 @@ be destroyed. 
*/ -static int rt6_ins(struct rt6_info *rt, struct nlmsghdr *nlh) +static int rt6_ins(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr) { int err; write_lock_bh(&rt6_lock); - err = fib6_add(&ip6_routing_table, rt, nlh); + err = fib6_add(&ip6_routing_table, rt, nlh, _rtattr); write_unlock_bh(&rt6_lock); return err; @@ -373,7 +368,7 @@ dst_hold(&rt->u.dst); - err = rt6_ins(rt, NULL); + err = rt6_ins(rt, NULL, NULL); if (err == 0) return rt; @@ -385,38 +380,6 @@ return &ip6_null_entry; } -#ifdef CONFIG_RT6_POLICY -static __inline__ struct rt6_info *rt6_flow_lookup_in(struct rt6_info *rt, - struct sk_buff *skb) -{ - struct in6_addr *daddr, *saddr; - struct fl_acc_args arg; - - arg.type = FL_ARG_FORWARD; - arg.fl_u.skb = skb; - - saddr = &skb->nh.ipv6h->saddr; - daddr = &skb->nh.ipv6h->daddr; - - return rt6_flow_lookup(rt, daddr, saddr, &arg); -} - -static __inline__ struct rt6_info *rt6_flow_lookup_out(struct rt6_info *rt, - struct sock *sk, - struct flowi *fl) -{ - struct fl_acc_args arg; - - arg.type = FL_ARG_ORIGIN; - arg.fl_u.fl_o.sk = sk; - arg.fl_u.fl_o.flow = fl; - - return rt6_flow_lookup(rt, fl->nl_u.ip6_u.daddr, fl->nl_u.ip6_u.saddr, - &arg); -} - -#endif - #define BACKTRACK() \ if (rt == &ip6_null_entry && strict) { \ while ((fn = fn->parent) != NULL) { \ @@ -449,53 +412,30 @@ rt = fn->leaf; if ((rt->rt6i_flags & RTF_CACHE)) { - if (ip6_rt_policy == 0) { - rt = rt6_device_match(rt, skb->dev->ifindex, strict); - BACKTRACK(); - dst_hold(&rt->u.dst); - goto out; - } - -#ifdef CONFIG_RT6_POLICY - if ((rt->rt6i_flags & RTF_FLOW)) { - struct rt6_info *sprt; - - for (sprt = rt; sprt; sprt = sprt->u.next) { - if (rt6_flow_match_in(sprt, skb)) { - rt = sprt; - dst_hold(&rt->u.dst); - goto out; - } - } - } -#endif + rt = rt6_device_match(rt, skb->dev->ifindex, strict); + BACKTRACK(); + dst_hold(&rt->u.dst); + goto out; } rt = rt6_device_match(rt, skb->dev->ifindex, 0); BACKTRACK(); - if (ip6_rt_policy == 0) { - if (!rt->rt6i_nexthop && !(rt->rt6i_flags & 
RTF_NONEXTHOP)) { - read_unlock_bh(&rt6_lock); + if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) { + read_unlock_bh(&rt6_lock); - rt = rt6_cow(rt, &skb->nh.ipv6h->daddr, - &skb->nh.ipv6h->saddr); + rt = rt6_cow(rt, &skb->nh.ipv6h->daddr, + &skb->nh.ipv6h->saddr); - if (rt->u.dst.error != -EEXIST || --attempts <= 0) - goto out2; - /* Race condition! In the gap, when rt6_lock was - released someone could insert this route. Relookup. - */ - goto relookup; - } - dst_hold(&rt->u.dst); - } else { -#ifdef CONFIG_RT6_POLICY - rt = rt6_flow_lookup_in(rt, skb); -#else - /* NEVER REACHED */ -#endif + if (rt->u.dst.error != -EEXIST || --attempts <= 0) + goto out2; + /* Race condition! In the gap, when rt6_lock was + released someone could insert this route. Relookup. + */ + dst_release(&rt->u.dst); + goto relookup; } + dst_hold(&rt->u.dst); out: read_unlock_bh(&rt6_lock); @@ -512,38 +452,21 @@ int strict; int attempts = 3; - strict = ipv6_addr_type(fl->nl_u.ip6_u.daddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL); + strict = ipv6_addr_type(&fl->fl6_dst) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL); relookup: read_lock_bh(&rt6_lock); - fn = fib6_lookup(&ip6_routing_table, fl->nl_u.ip6_u.daddr, - fl->nl_u.ip6_u.saddr); + fn = fib6_lookup(&ip6_routing_table, &fl->fl6_dst, &fl->fl6_src); restart: rt = fn->leaf; if ((rt->rt6i_flags & RTF_CACHE)) { - if (ip6_rt_policy == 0) { - rt = rt6_device_match(rt, fl->oif, strict); - BACKTRACK(); - dst_hold(&rt->u.dst); - goto out; - } - -#ifdef CONFIG_RT6_POLICY - if ((rt->rt6i_flags & RTF_FLOW)) { - struct rt6_info *sprt; - - for (sprt = rt; sprt; sprt = sprt->u.next) { - if (rt6_flow_match_out(sprt, sk)) { - rt = sprt; - dst_hold(&rt->u.dst); - goto out; - } - } - } -#endif + rt = rt6_device_match(rt, fl->oif, strict); + BACKTRACK(); + dst_hold(&rt->u.dst); + goto out; } if (rt->rt6i_flags & RTF_DEFAULT) { if (rt->rt6i_metric >= IP6_RT_PRIO_ADDRCONF) @@ -553,29 +476,21 @@ BACKTRACK(); } - if (ip6_rt_policy == 0) { - if 
(!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) { - read_unlock_bh(&rt6_lock); + if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) { + read_unlock_bh(&rt6_lock); - rt = rt6_cow(rt, fl->nl_u.ip6_u.daddr, - fl->nl_u.ip6_u.saddr); - - if (rt->u.dst.error != -EEXIST || --attempts <= 0) - goto out2; + rt = rt6_cow(rt, &fl->fl6_dst, &fl->fl6_src); - /* Race condition! In the gap, when rt6_lock was - released someone could insert this route. Relookup. - */ - goto relookup; - } - dst_hold(&rt->u.dst); - } else { -#ifdef CONFIG_RT6_POLICY - rt = rt6_flow_lookup_out(rt, sk, fl); -#else - /* NEVER REACHED */ -#endif + if (rt->u.dst.error != -EEXIST || --attempts <= 0) + goto out2; + + /* Race condition! In the gap, when rt6_lock was + released someone could insert this route. Relookup. + */ + dst_release(&rt->u.dst); + goto relookup; } + dst_hold(&rt->u.dst); out: read_unlock_bh(&rt6_lock); @@ -603,23 +518,13 @@ return NULL; } -static struct dst_entry *ip6_dst_reroute(struct dst_entry *dst, struct sk_buff *skb) -{ - /* - * FIXME - */ - RDBG(("ip6_dst_reroute(%p,%p)[%p] (AIEEE)\n", dst, skb, - __builtin_return_address(0))); - return NULL; -} - static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) { struct rt6_info *rt = (struct rt6_info *) dst; if (rt) { if (rt->rt6i_flags & RTF_CACHE) - ip6_del_rt(rt, NULL); + ip6_del_rt(rt, NULL, NULL); else dst_release(dst); } @@ -642,7 +547,76 @@ } } -static int ip6_dst_gc() +static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu) +{ + struct rt6_info *rt6 = (struct rt6_info*)dst; + + if (mtu < dst_pmtu(dst) && rt6->rt6i_dst.plen == 128) { + rt6->rt6i_flags |= RTF_MODIFIED; + dst->metrics[RTAX_MTU-1] = mtu; + } +} + +/* Protected by rt6_lock. 
*/ +static struct dst_entry *ndisc_dst_gc_list; + +struct dst_entry *ndisc_dst_alloc(struct net_device *dev, + struct neighbour *neigh, + int (*output)(struct sk_buff *)) +{ + struct rt6_info *rt = ip6_dst_alloc(); + + if (unlikely(rt == NULL)) + goto out; + + if (dev) + dev_hold(dev); + if (neigh) + neigh_hold(neigh); + + rt->rt6i_dev = dev; + rt->rt6i_nexthop = neigh; + rt->rt6i_expires = 0; + rt->rt6i_flags = RTF_LOCAL | RTF_NDISC; + rt->rt6i_metric = 0; + atomic_set(&rt->u.dst.__refcnt, 1); + rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255; + rt->u.dst.output = output; + + write_lock_bh(&rt6_lock); + rt->u.dst.next = ndisc_dst_gc_list; + ndisc_dst_gc_list = &rt->u.dst; + write_unlock_bh(&rt6_lock); + + fib6_force_start_gc(); + +out: + return (struct dst_entry *)rt; +} + +int ndisc_dst_gc(int *more) +{ + struct dst_entry *dst, *next, **pprev; + int freed; + + next = NULL; + pprev = &ndisc_dst_gc_list; + freed = 0; + while ((dst = *pprev) != NULL) { + if (!atomic_read(&dst->__refcnt)) { + *pprev = dst->next; + dst_free(dst); + freed++; + } else { + pprev = &dst->next; + (*more)++; + } + } + + return freed; +} + +static int ip6_dst_gc(void) { static unsigned expire = 30*HZ; static unsigned long last_gc; @@ -669,19 +643,6 @@ Remove it only when all the things will work! */ -static void ipv6_addr_prefix(struct in6_addr *pfx, - const struct in6_addr *addr, int plen) -{ - int b = plen&0x7; - int o = plen>>3; - - memcpy(pfx->s6_addr, addr, o); - if (o < 16) - memset(pfx->s6_addr + o, 0, 16 - o); - if (b != 0) - pfx->s6_addr[o] = addr->s6_addr[o]&(0xff00 >> b); -} - static int ipv6_get_mtu(struct net_device *dev) { int mtu = IPV6_MIN_MTU; @@ -695,6 +656,24 @@ return mtu; } +static inline unsigned int ipv6_advmss(unsigned int mtu) +{ + mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); + + if (mtu < ip6_rt_min_advmss) + mtu = ip6_rt_min_advmss; + + /* + * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and + * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 
+ * IPV6_MAXPLEN is also valid and means: "any MSS, + * rely only on pmtu discovery" + */ + if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) + mtu = IPV6_MAXPLEN; + return mtu; +} + static int ipv6_get_hoplimit(struct net_device *dev) { int hoplimit = ipv6_devconf.hop_limit; @@ -712,14 +691,17 @@ * */ -int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh) +int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr) { int err; struct rtmsg *r; + struct rtattr **rta; struct rt6_info *rt; struct net_device *dev = NULL; int addr_type; + rta = (struct rtattr **) _rtattr; + if (rtmsg->rtmsg_dst_len > 128 || rtmsg->rtmsg_src_len > 128) return -EINVAL; #ifndef CONFIG_IPV6_SUBTREES @@ -729,7 +711,7 @@ if (rtmsg->rtmsg_metric == 0) rtmsg->rtmsg_metric = IP6_RT_PRIO_USER; - rt = dst_alloc(&ip6_dst_ops); + rt = ip6_dst_alloc(); if (rt == NULL) return -ENOMEM; @@ -849,23 +831,42 @@ } } - if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) - rt->rt6i_hoplimit = IPV6_DEFAULT_MCASTHOPS; - else - rt->rt6i_hoplimit = ipv6_get_hoplimit(dev); - rt->rt6i_flags = rtmsg->rtmsg_flags; + rt->rt6i_flags = rtmsg->rtmsg_flags & ~RTF_NDISC; install_route: - rt->u.dst.pmtu = ipv6_get_mtu(dev); - rt->u.dst.advmss = max_t(unsigned int, rt->u.dst.pmtu - 60, ip6_rt_min_advmss); - /* Maximal non-jumbo IPv6 payload is 65535 and corresponding - MSS is 65535 - tcp_header_size. 
65535 is also valid and - means: "any MSS, rely only on pmtu discovery" - */ - if (rt->u.dst.advmss > 65535-20) - rt->u.dst.advmss = 65535; + if (rta && rta[RTA_METRICS-1]) { + int attrlen = RTA_PAYLOAD(rta[RTA_METRICS-1]); + struct rtattr *attr = RTA_DATA(rta[RTA_METRICS-1]); + + while (RTA_OK(attr, attrlen)) { + unsigned flavor = attr->rta_type; + if (flavor) { + if (flavor > RTAX_MAX) { + err = -EINVAL; + goto out; + } + rt->u.dst.metrics[flavor-1] = + *(u32 *)RTA_DATA(attr); + } + attr = RTA_NEXT(attr, attrlen); + } + } + + if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0) { + if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) + rt->u.dst.metrics[RTAX_HOPLIMIT-1] = + IPV6_DEFAULT_MCASTHOPS; + else + rt->u.dst.metrics[RTAX_HOPLIMIT-1] = + ipv6_get_hoplimit(dev); + } + + if (!rt->u.dst.metrics[RTAX_MTU-1]) + rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev); + if (!rt->u.dst.metrics[RTAX_ADVMSS-1]) + rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_pmtu(&rt->u.dst)); rt->u.dst.dev = dev; - return rt6_ins(rt, nlh); + return rt6_ins(rt, nlh, _rtattr); out: if (dev) @@ -874,7 +875,7 @@ return err; } -int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh) +int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr) { int err; @@ -886,13 +887,13 @@ dst_release(&rt->u.dst); - err = fib6_del(rt, nlh); + err = fib6_del(rt, nlh, _rtattr); write_unlock_bh(&rt6_lock); return err; } -int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh) +static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr) { struct fib6_node *fn; struct rt6_info *rt; @@ -919,7 +920,7 @@ dst_hold(&rt->u.dst); read_unlock_bh(&rt6_lock); - return ip6_del_rt(rt, nlh); + return ip6_del_rt(rt, nlh, _rtattr); } } read_unlock_bh(&rt6_lock); @@ -1015,17 +1016,14 @@ ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key); nrt->rt6i_nexthop = neigh_clone(neigh); /* Reset pmtu, it may be better */ - nrt->u.dst.pmtu = ipv6_get_mtu(neigh->dev); - 
nrt->u.dst.advmss = max_t(unsigned int, nrt->u.dst.pmtu - 60, ip6_rt_min_advmss); - if (rt->u.dst.advmss > 65535-20) - rt->u.dst.advmss = 65535; - nrt->rt6i_hoplimit = ipv6_get_hoplimit(neigh->dev); + nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev); + nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_pmtu(&nrt->u.dst)); - if (rt6_ins(nrt, NULL)) + if (rt6_ins(nrt, NULL, NULL)) goto out; if (rt->rt6i_flags&RTF_CACHE) { - ip6_del_rt(rt, NULL); + ip6_del_rt(rt, NULL, NULL); return; } @@ -1060,7 +1058,7 @@ if (rt == NULL) return; - if (pmtu >= rt->u.dst.pmtu) + if (pmtu >= dst_pmtu(&rt->u.dst)) goto out; /* New mtu received -> path was valid. @@ -1075,7 +1073,7 @@ would return automatically. */ if (rt->rt6i_flags & RTF_CACHE) { - rt->u.dst.pmtu = pmtu; + rt->u.dst.metrics[RTAX_MTU-1] = pmtu; dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires); rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES; goto out; @@ -1089,7 +1087,7 @@ if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) { nrt = rt6_cow(rt, daddr, saddr); if (!nrt->u.dst.error) { - nrt->u.dst.pmtu = pmtu; + nrt->u.dst.metrics[RTAX_MTU-1] = pmtu; /* According to RFC 1981, detecting PMTU increase shouldn't be happened within 5 mins, the recommended timer is 10 mins. 
Here this route expiration time is set to ip6_rt_mtu_expires @@ -1098,8 +1096,8 @@ */ dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires); nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES; - dst_release(&nrt->u.dst); } + dst_release(&nrt->u.dst); } else { nrt = ip6_rt_copy(rt); if (nrt == NULL) @@ -1110,8 +1108,8 @@ nrt->rt6i_nexthop = neigh_clone(rt->rt6i_nexthop); dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires); nrt->rt6i_flags |= RTF_DYNAMIC|RTF_CACHE|RTF_EXPIRES; - nrt->u.dst.pmtu = pmtu; - rt6_ins(nrt, NULL); + nrt->u.dst.metrics[RTAX_MTU-1] = pmtu; + rt6_ins(nrt, NULL, NULL); } out: @@ -1124,20 +1122,19 @@ static struct rt6_info * ip6_rt_copy(struct rt6_info *ort) { - struct rt6_info *rt; + struct rt6_info *rt = ip6_dst_alloc(); - rt = dst_alloc(&ip6_dst_ops); + BUG_ON(ort->rt6i_flags & RTF_NDISC); if (rt) { rt->u.dst.input = ort->u.dst.input; rt->u.dst.output = ort->u.dst.output; - memcpy(&rt->u.dst.mxlock, &ort->u.dst.mxlock, RTAX_MAX*sizeof(unsigned)); + memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32)); rt->u.dst.dev = ort->u.dst.dev; if (rt->u.dst.dev) dev_hold(rt->u.dst.dev); rt->u.dst.lastuse = jiffies; - rt->rt6i_hoplimit = ort->rt6i_hoplimit; rt->rt6i_expires = 0; ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway); @@ -1184,7 +1181,7 @@ rtmsg.rtmsg_ifindex = dev->ifindex; - ip6_route_add(&rtmsg, NULL); + ip6_route_add(&rtmsg, NULL, NULL); return rt6_get_dflt_router(gwaddr, dev); } @@ -1210,7 +1207,7 @@ read_unlock_bh(&rt6_lock); - ip6_del_rt(rt, NULL); + ip6_del_rt(rt, NULL, NULL); goto restart; } @@ -1236,10 +1233,10 @@ rtnl_lock(); switch (cmd) { case SIOCADDRT: - err = ip6_route_add(&rtmsg, NULL); + err = ip6_route_add(&rtmsg, NULL, NULL); break; case SIOCDELRT: - err = ip6_route_del(&rtmsg, NULL); + err = ip6_route_del(&rtmsg, NULL, NULL); break; default: err = -EINVAL; @@ -1268,11 +1265,10 @@ * Add address */ -int ip6_rt_addr_add(struct in6_addr *addr, struct net_device *dev) +int ip6_rt_addr_add(struct in6_addr *addr, struct 
net_device *dev, int anycast) { - struct rt6_info *rt; + struct rt6_info *rt = ip6_dst_alloc(); - rt = dst_alloc(&ip6_dst_ops); if (rt == NULL) return -ENOMEM; @@ -1280,14 +1276,14 @@ rt->u.dst.input = ip6_input; rt->u.dst.output = ip6_output; rt->rt6i_dev = dev_get_by_name("lo"); - rt->u.dst.pmtu = ipv6_get_mtu(rt->rt6i_dev); - rt->u.dst.advmss = max_t(unsigned int, rt->u.dst.pmtu - 60, ip6_rt_min_advmss); - if (rt->u.dst.advmss > 65535-20) - rt->u.dst.advmss = 65535; - rt->rt6i_hoplimit = ipv6_get_hoplimit(rt->rt6i_dev); + rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev); + rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_pmtu(&rt->u.dst)); + rt->u.dst.metrics[RTAX_HOPLIMIT-1] = ipv6_get_hoplimit(rt->rt6i_dev); rt->u.dst.obsolete = -1; rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP; + if (!anycast) + rt->rt6i_flags |= RTF_LOCAL; rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway); if (rt->rt6i_nexthop == NULL) { dst_free((struct dst_entry *) rt); @@ -1296,7 +1292,7 @@ ipv6_addr_copy(&rt->rt6i_dst.addr, addr); rt->rt6i_dst.plen = 128; - rt6_ins(rt, NULL); + rt6_ins(rt, NULL, NULL); return 0; } @@ -1313,129 +1309,13 @@ rt = rt6_lookup(addr, NULL, loopback_dev.ifindex, 1); if (rt) { if (rt->rt6i_dst.plen == 128) - err = ip6_del_rt(rt, NULL); + err = ip6_del_rt(rt, NULL, NULL); else dst_release(&rt->u.dst); } return err; } - -#ifdef CONFIG_RT6_POLICY - -static int rt6_flow_match_in(struct rt6_info *rt, struct sk_buff *skb) -{ - struct flow_filter *frule; - struct pkt_filter *filter; - int res = 1; - - if ((frule = rt->rt6i_filter) == NULL) - goto out; - - if (frule->type != FLR_INPUT) { - res = 0; - goto out; - } - - for (filter = frule->u.filter; filter; filter = filter->next) { - __u32 *word; - - word = (__u32 *) skb->h.raw; - word += filter->offset; - - if ((*word ^ filter->value) & filter->mask) { - res = 0; - break; - } - } - -out: - return res; -} - -static int rt6_flow_match_out(struct rt6_info *rt, struct sock *sk) -{ - struct flow_filter 
*frule; - int res = 1; - - if ((frule = rt->rt6i_filter) == NULL) - goto out; - - if (frule->type != FLR_INPUT) { - res = 0; - goto out; - } - - if (frule->u.sk != sk) - res = 0; -out: - return res; -} - -static struct rt6_info *rt6_flow_lookup(struct rt6_info *rt, - struct in6_addr *daddr, - struct in6_addr *saddr, - struct fl_acc_args *args) -{ - struct flow_rule *frule; - struct rt6_info *nrt = NULL; - struct pol_chain *pol; - - for (pol = rt6_pol_list; pol; pol = pol->next) { - struct fib6_node *fn; - struct rt6_info *sprt; - - fn = fib6_lookup(pol->rules, daddr, saddr); - - do { - for (sprt = fn->leaf; sprt; sprt=sprt->u.next) { - int res; - - frule = sprt->rt6i_flowr; -#if RT6_DEBUG >= 2 - if (frule == NULL) { - printk(KERN_DEBUG "NULL flowr\n"); - goto error; - } -#endif - res = frule->ops->accept(rt, sprt, args, &nrt); - - switch (res) { - case FLOWR_SELECT: - goto found; - case FLOWR_CLEAR: - goto next_policy; - case FLOWR_NODECISION: - break; - default: - goto error; - }; - } - - fn = fn->parent; - - } while ((fn->fn_flags & RTN_TL_ROOT) == 0); - - next_policy: - } - -error: - dst_hold(&ip6_null_entry.u.dst); - return &ip6_null_entry; - -found: - if (nrt == NULL) - goto error; - - nrt->rt6i_flags |= RTF_CACHE; - dst_hold(&nrt->u.dst); - err = rt6_ins(nrt, NULL); - if (err) - nrt->u.dst.error = err; - return nrt; -} -#endif - static int fib6_ifdown(struct rt6_info *rt, void *arg) { if (((void*)rt->rt6i_dev == arg || arg == NULL) && @@ -1487,14 +1367,12 @@ PMTU discouvery. 
*/ if (rt->rt6i_dev == arg->dev && - !(rt->u.dst.mxlock&(1<<RTAX_MTU)) && - (rt->u.dst.pmtu > arg->mtu || - (rt->u.dst.pmtu < arg->mtu && - rt->u.dst.pmtu == idev->cnf.mtu6))) - rt->u.dst.pmtu = arg->mtu; - rt->u.dst.advmss = max_t(unsigned int, arg->mtu - 60, ip6_rt_min_advmss); - if (rt->u.dst.advmss > 65535-20) - rt->u.dst.advmss = 65535; + !dst_metric_locked(&rt->u.dst, RTAX_MTU) && + (dst_pmtu(&rt->u.dst) > arg->mtu || + (dst_pmtu(&rt->u.dst) < arg->mtu && + dst_pmtu(&rt->u.dst) == idev->cnf.mtu6))) + rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu; + rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu); return 0; } @@ -1556,7 +1434,7 @@ if (inet6_rtm_to_rtmsg(r, arg, &rtmsg)) return -EINVAL; - return ip6_route_del(&rtmsg, nlh); + return ip6_route_del(&rtmsg, nlh, arg); } int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) @@ -1566,7 +1444,7 @@ if (inet6_rtm_to_rtmsg(r, arg, &rtmsg)) return -EINVAL; - return ip6_route_add(&rtmsg, nlh); + return ip6_route_add(&rtmsg, nlh, arg); } struct rt6_rtnl_dump_arg @@ -1642,7 +1520,7 @@ if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0) RTA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf); } - if (rtnetlink_put_metrics(skb, &rt->u.dst.mxlock) < 0) + if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0) goto rtattr_failure; if (rt->u.dst.neighbour) RTA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key); @@ -1798,15 +1676,13 @@ skb->mac.raw = skb->data; skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr)); - fl.proto = 0; - fl.nl_u.ip6_u.daddr = NULL; - fl.nl_u.ip6_u.saddr = NULL; - fl.uli_u.icmpt.type = 0; - fl.uli_u.icmpt.code = 0; + memset(&fl, 0, sizeof(fl)); if (rta[RTA_SRC-1]) - fl.nl_u.ip6_u.saddr = (struct in6_addr*)RTA_DATA(rta[RTA_SRC-1]); + ipv6_addr_copy(&fl.fl6_src, + (struct in6_addr*)RTA_DATA(rta[RTA_SRC-1])); if (rta[RTA_DST-1]) - fl.nl_u.ip6_u.daddr = (struct in6_addr*)RTA_DATA(rta[RTA_DST-1]); + ipv6_addr_copy(&fl.fl6_dst, + (struct in6_addr*)RTA_DATA(rta[RTA_DST-1])); if (rta[RTA_IIF-1]) memcpy(&iif, 
RTA_DATA(rta[RTA_IIF-1]), sizeof(int)); @@ -1830,8 +1706,7 @@ NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid; err = rt6_fill_node(skb, rt, - fl.nl_u.ip6_u.daddr, - fl.nl_u.ip6_u.saddr, + &fl.fl6_dst, &fl.fl6_src, iif, RTM_NEWROUTE, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, nlh, 0); @@ -2043,7 +1918,6 @@ #endif - void __init ip6_route_init(void) { ip6_dst_ops.kmem_cachep = kmem_cache_create("ip6_dst_cache", @@ -2055,6 +1929,9 @@ proc_net_create("ipv6_route", 0, rt6_proc_info); proc_net_create("rt6_stats", 0, rt6_proc_stats); #endif +#ifdef CONFIG_XFRM + xfrm6_init(); +#endif } #ifdef MODULE @@ -2064,8 +1941,11 @@ proc_net_remove("ipv6_route"); proc_net_remove("rt6_stats"); #endif - +#ifdef CONFIG_XFRM + xfrm6_fini(); +#endif rt6_ifdown(NULL); fib6_gc_cleanup(); + kmem_cache_destroy(ip6_dst_ops.kmem_cachep); } #endif /* MODULE */ Index: net/ipv6/sit.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv6/sit.c,v retrieving revision 1.1.1.24 retrieving revision 1.1.1.24.2.1 diff -u -r1.1.1.24 -r1.1.1.24.2.1 --- a/net/ipv6/sit.c 28 Nov 2003 18:26:21 -0000 1.1.1.24 +++ b/net/ipv6/sit.c 16 Apr 2004 13:16:26 -0000 1.1.1.24.2.1 @@ -49,6 +49,7 @@ #include #include #include +#include /* This version of net/ipv6/sit.c is cloned of net/ipv4/ip_gre.c @@ -392,6 +393,7 @@ read_lock(&ipip6_lock); if ((tunnel = ipip6_tunnel_lookup(iph->saddr, iph->daddr)) != NULL) { + secpath_reset(skb); skb->mac.raw = skb->nh.raw; skb->nh.raw = skb->data; memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); @@ -422,13 +424,6 @@ return 0; } -/* Need this wrapper because NF_HOOK takes the function address */ -static inline int do_ip_send(struct sk_buff *skb) -{ - return ip_send(skb); -} - - /* Returns the embedded IPv4 address if the IPv6 address comes from 6to4 (draft-ietf-ngtrans-6to4-04) addr space */ @@ -501,9 +496,17 @@ dst = addr6->s6_addr32[3]; } - if (ip_route_output(&rt, dst, tiph->saddr, 
RT_TOS(tos), tunnel->parms.link)) { - tunnel->stat.tx_carrier_errors++; - goto tx_error_icmp; + { + struct flowi fl = { .nl_u = { .ip4_u = + { .daddr = dst, + .saddr = tiph->saddr, + .tos = RT_TOS(tos) } }, + .oif = tunnel->parms.link, + .proto = IPPROTO_IPV6 }; + if (ip_route_output_key(&rt, &fl)) { + tunnel->stat.tx_carrier_errors++; + goto tx_error_icmp; + } } if (rt->rt_type != RTN_UNICAST) { tunnel->stat.tx_carrier_errors++; @@ -518,9 +521,9 @@ } if (tiph->frag_off) - mtu = rt->u.dst.pmtu - sizeof(struct iphdr); + mtu = dst_pmtu(&rt->u.dst) - sizeof(struct iphdr); else - mtu = skb->dst ? skb->dst->pmtu : dev->mtu; + mtu = skb->dst ? dst_pmtu(skb->dst) : dev->mtu; if (mtu < 68) { tunnel->stat.collisions++; @@ -529,15 +532,9 @@ } if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; - if (skb->dst && mtu < skb->dst->pmtu) { - struct rt6_info *rt6 = (struct rt6_info*)skb->dst; - if (mtu < rt6->u.dst.pmtu) { - if (tunnel->parms.iph.daddr || rt6->rt6i_dst.plen == 128) { - rt6->rt6i_flags |= RTF_MODIFIED; - rt6->u.dst.pmtu = mtu; - } - } - } + if (tunnel->parms.iph.daddr && skb->dst) + skb->dst->ops->update_pmtu(skb->dst, mtu); + if (skb->len > mtu) { icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev); ip_rt_put(rt); @@ -555,7 +552,7 @@ /* * Okay, now see if we can stuff it in the buffer as-is. 
*/ - max_headroom = (((tdev->hard_header_len+15)&~15)+sizeof(struct iphdr)); + max_headroom = LL_RESERVED_SPACE(tdev)+sizeof(struct iphdr); if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) { struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); @@ -776,8 +773,14 @@ ipip6_tunnel_init_gen(dev); if (iph->daddr) { + struct flowi fl = { .nl_u = { .ip4_u = + { .daddr = iph->daddr, + .saddr = iph->saddr, + .tos = RT_TOS(iph->tos) } }, + .oif = tunnel->parms.link, + .proto = IPPROTO_IPV6 }; struct rtable *rt; - if (!ip_route_output(&rt, iph->daddr, iph->saddr, RT_TOS(iph->tos), tunnel->parms.link)) { + if (!ip_route_output_key(&rt, &fl)) { tdev = rt->u.dst.dev; ip_rt_put(rt); } @@ -834,19 +837,14 @@ } static struct inet_protocol sit_protocol = { - ipip6_rcv, - ipip6_err, - 0, - IPPROTO_IPV6, - 0, - NULL, - "IPv6" + .handler = ipip6_rcv, + .err_handler = ipip6_err, }; #ifdef MODULE void sit_cleanup(void) { - inet_del_protocol(&sit_protocol); + inet_del_protocol(&sit_protocol, IPPROTO_IPV6); unregister_netdev(&ipip6_fb_tunnel_dev); } #endif @@ -855,9 +853,13 @@ { printk(KERN_INFO "IPv6 over IPv4 tunneling driver\n"); + if (inet_add_protocol(&sit_protocol, IPPROTO_IPV6) < 0) { + printk(KERN_INFO "sit init: Can't add protocol\n"); + return -EAGAIN; + } + ipip6_fb_tunnel_dev.priv = (void*)&ipip6_fb_tunnel; strcpy(ipip6_fb_tunnel_dev.name, ipip6_fb_tunnel.parms.name); register_netdev(&ipip6_fb_tunnel_dev); - inet_add_protocol(&sit_protocol); return 0; } Index: net/ipv6/tcp_ipv6.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv6/tcp_ipv6.c,v retrieving revision 1.1.1.30 retrieving revision 1.1.1.30.2.1 diff -u -r1.1.1.30 -r1.1.1.30.2.1 --- a/net/ipv6/tcp_ipv6.c 14 Apr 2004 13:05:41 -0000 1.1.1.30 +++ b/net/ipv6/tcp_ipv6.c 16 Apr 2004 13:16:26 -0000 1.1.1.30.2.1 @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -553,7 
+554,6 @@ struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; struct in6_addr *saddr = NULL; - struct in6_addr saddr_buf; struct flowi fl; struct dst_entry *dst; int addr_type; @@ -565,7 +565,8 @@ if (usin->sin6_family != AF_INET6) return(-EAFNOSUPPORT); - fl.fl6_flowlabel = 0; + memset(&fl, 0, sizeof(fl)); + if (np->sndflow) { fl.fl6_flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK; IP6_ECN_flow_init(fl.fl6_flowlabel); @@ -659,43 +660,45 @@ saddr = &np->rcv_saddr; fl.proto = IPPROTO_TCP; - fl.fl6_dst = &np->daddr; - fl.fl6_src = saddr; + ipv6_addr_copy(&fl.fl6_dst, &np->daddr); + ipv6_addr_copy(&fl.fl6_src, + (saddr ? saddr : &np->saddr)); fl.oif = sk->bound_dev_if; - fl.uli_u.ports.dport = usin->sin6_port; - fl.uli_u.ports.sport = sk->sport; + fl.fl_ip_dport = usin->sin6_port; + fl.fl_ip_sport = sk->sport; if (np->opt && np->opt->srcrt) { struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt; - fl.nl_u.ip6_u.daddr = rt0->addr; + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); } - dst = ip6_route_output(sk, &fl); + err = ip6_dst_lookup(sk, &dst, &fl); - if ((err = dst->error) != 0) { - dst_release(dst); + if (err) goto failure; - } - - ip6_dst_store(sk, dst, NULL); - sk->route_caps = dst->dev->features&~NETIF_F_IP_CSUM; if (saddr == NULL) { - err = ipv6_get_saddr(dst, &np->daddr, &saddr_buf); - if (err) - goto failure; - - saddr = &saddr_buf; + saddr = &fl.fl6_src; + ipv6_addr_copy(&np->rcv_saddr, saddr); } /* set the source address */ - ipv6_addr_copy(&np->rcv_saddr, saddr); ipv6_addr_copy(&np->saddr, saddr); sk->rcv_saddr= LOOPBACK4_IPV6; + ip6_dst_store(sk, dst, NULL); + sk->route_caps = dst->dev->features & + ~(NETIF_F_IP_CSUM +#ifdef NETIF_F_TSO + | NETIF_F_TSO +#endif + ); + tp->ext_header_len = 0; if (np->opt) tp->ext_header_len = np->opt->opt_flen+np->opt->opt_nflen; + tp->ext2_header_len = dst->header_len; + tp->mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); sk->dport = usin->sin6_port; @@ 
-717,8 +720,8 @@ late_failure: tcp_set_state(sk, TCP_CLOSE); -failure: __sk_dst_reset(sk); +failure: sk->dport = 0; sk->route_caps = 0; return err; @@ -781,21 +784,23 @@ to handle rthdr case. Ignore this complexity for now. */ + memset(&fl, 0, sizeof(fl)); fl.proto = IPPROTO_TCP; - fl.nl_u.ip6_u.daddr = &np->daddr; - fl.nl_u.ip6_u.saddr = &np->saddr; + ipv6_addr_copy(&fl.fl6_dst, &np->daddr); + ipv6_addr_copy(&fl.fl6_src, &np->saddr); fl.oif = sk->bound_dev_if; - fl.uli_u.ports.dport = sk->dport; - fl.uli_u.ports.sport = sk->sport; + fl.fl_ip_dport = sk->dport; + fl.fl_ip_sport = sk->sport; - dst = ip6_route_output(sk, &fl); + if ((err = ip6_dst_lookup(sk, &dst, &fl))) { + sk->err_soft = -err; + goto out; + } } else dst_hold(dst); - if (dst->error) { - sk->err_soft = -dst->error; - } else if (tp->pmtu_cookie > dst->pmtu) { - tcp_sync_mss(sk, dst->pmtu); + if (tp->pmtu_cookie > dst_pmtu(dst)) { + tcp_sync_mss(sk, dst_pmtu(dst)); tcp_simple_retransmit(sk); } /* else let the usual retransmit timer handle it */ dst_release(dst); @@ -865,13 +870,14 @@ struct flowi fl; int err = -1; + memset(&fl, 0, sizeof(fl)); fl.proto = IPPROTO_TCP; - fl.nl_u.ip6_u.daddr = &req->af.v6_req.rmt_addr; - fl.nl_u.ip6_u.saddr = &req->af.v6_req.loc_addr; + ipv6_addr_copy(&fl.fl6_dst, &req->af.v6_req.rmt_addr); + ipv6_addr_copy(&fl.fl6_src, &req->af.v6_req.loc_addr); fl.fl6_flowlabel = 0; fl.oif = req->af.v6_req.iif; - fl.uli_u.ports.dport = req->rmt_port; - fl.uli_u.ports.sport = sk->sport; + fl.fl_ip_dport = req->rmt_port; + fl.fl_ip_sport = sk->sport; if (dst == NULL) { opt = sk->net_pinfo.af_inet6.opt; @@ -886,11 +892,11 @@ if (opt && opt->srcrt) { struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt; - fl.nl_u.ip6_u.daddr = rt0->addr; + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); } - dst = ip6_route_output(sk, &fl); - if (dst->error) + err = ip6_dst_lookup(sk, &dst, &fl); + if (err) goto done; } @@ -902,7 +908,7 @@ &req->af.v6_req.loc_addr, &req->af.v6_req.rmt_addr, csum_partial((char *)th, 
skb->len, skb->csum)); - fl.nl_u.ip6_u.daddr = &req->af.v6_req.rmt_addr; + ipv6_addr_copy(&fl.fl6_dst, &req->af.v6_req.rmt_addr); err = ip6_xmit(sk, skb, &fl, opt); if (err == NET_XMIT_CN) err = 0; @@ -970,7 +976,7 @@ if (th->rst) return; - if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) + if (!ipv6_unicast_destination(skb)) return; /* @@ -1003,24 +1009,21 @@ buff->csum = csum_partial((char *)t1, sizeof(*t1), 0); - fl.nl_u.ip6_u.daddr = &skb->nh.ipv6h->saddr; - fl.nl_u.ip6_u.saddr = &skb->nh.ipv6h->daddr; - fl.fl6_flowlabel = 0; + memset(&fl, 0, sizeof(fl)); + ipv6_addr_copy(&fl.fl6_dst, &skb->nh.ipv6h->saddr); + ipv6_addr_copy(&fl.fl6_src, &skb->nh.ipv6h->daddr); - t1->check = csum_ipv6_magic(fl.nl_u.ip6_u.saddr, - fl.nl_u.ip6_u.daddr, + t1->check = csum_ipv6_magic(&fl.fl6_src, &fl.fl6_dst, sizeof(*t1), IPPROTO_TCP, buff->csum); fl.proto = IPPROTO_TCP; fl.oif = tcp_v6_iif(skb); - fl.uli_u.ports.dport = t1->dest; - fl.uli_u.ports.sport = t1->source; + fl.fl_ip_dport = t1->dest; + fl.fl_ip_sport = t1->source; /* sk = NULL, but it is safe for now. RST socket required. 
*/ - buff->dst = ip6_route_output(NULL, &fl); - - if (buff->dst->error == 0) { + if (!ip6_dst_lookup(NULL, &buff->dst, &fl)) { ip6_xmit(NULL, buff, &fl, NULL); TCP_INC_STATS_BH(TcpOutSegs); TCP_INC_STATS_BH(TcpOutRsts); @@ -1070,23 +1073,20 @@ buff->csum = csum_partial((char *)t1, tot_len, 0); - fl.nl_u.ip6_u.daddr = &skb->nh.ipv6h->saddr; - fl.nl_u.ip6_u.saddr = &skb->nh.ipv6h->daddr; - fl.fl6_flowlabel = 0; + memset(&fl, 0, sizeof(fl)); + ipv6_addr_copy(&fl.fl6_dst, &skb->nh.ipv6h->saddr); + ipv6_addr_copy(&fl.fl6_src, &skb->nh.ipv6h->daddr); - t1->check = csum_ipv6_magic(fl.nl_u.ip6_u.saddr, - fl.nl_u.ip6_u.daddr, + t1->check = csum_ipv6_magic(&fl.fl6_src, &fl.fl6_dst, tot_len, IPPROTO_TCP, buff->csum); fl.proto = IPPROTO_TCP; fl.oif = tcp_v6_iif(skb); - fl.uli_u.ports.dport = t1->dest; - fl.uli_u.ports.sport = t1->source; + fl.fl_ip_dport = t1->dest; + fl.fl_ip_sport = t1->source; - buff->dst = ip6_route_output(NULL, &fl); - - if (buff->dst->error == 0) { + if (!ip6_dst_lookup(NULL, &buff->dst, &fl)) { ip6_xmit(NULL, buff, &fl, NULL); TCP_INC_STATS_BH(TcpOutSegs); return; @@ -1177,8 +1177,7 @@ if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_conn_request(sk, skb); - /* FIXME: do the same check for anycast */ - if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) + if (!ipv6_unicast_destination(skb)) goto drop; /* @@ -1248,7 +1247,6 @@ struct dst_entry *dst) { struct ipv6_pinfo *np; - struct flowi fl; struct tcp_opt *newtp; struct sock *newsk; struct ipv6_txoptions *opt; @@ -1310,23 +1308,23 @@ } if (dst == NULL) { + struct flowi fl; + + memset(&fl, 0, sizeof(fl)); fl.proto = IPPROTO_TCP; - fl.nl_u.ip6_u.daddr = &req->af.v6_req.rmt_addr; + ipv6_addr_copy(&fl.fl6_dst, &req->af.v6_req.rmt_addr); if (opt && opt->srcrt) { struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt; - fl.nl_u.ip6_u.daddr = rt0->addr; + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); } - fl.nl_u.ip6_u.saddr = &req->af.v6_req.loc_addr; - fl.fl6_flowlabel = 0; + ipv6_addr_copy(&fl.fl6_src, 
&req->af.v6_req.loc_addr); fl.oif = sk->bound_dev_if; - fl.uli_u.ports.dport = req->rmt_port; - fl.uli_u.ports.sport = sk->sport; - - dst = ip6_route_output(sk, &fl); - } + fl.fl_ip_dport = req->rmt_port; + fl.fl_ip_sport = sk->sport; - if (dst->error) - goto out; + if (ip6_dst_lookup(sk, &dst, &fl)) + goto out; + } newsk = tcp_create_openreq_child(sk, req, skb); if (newsk == NULL) @@ -1339,7 +1337,12 @@ MOD_INC_USE_COUNT; ip6_dst_store(newsk, dst, NULL); - sk->route_caps = dst->dev->features&~NETIF_F_IP_CSUM; + newsk->route_caps = dst->dev->features& + ~(NETIF_F_IP_CSUM +#ifdef NETIF_F_TSO + | NETIF_F_TSO +#endif + ); newtp = &(newsk->tp_pinfo.af_tcp); @@ -1387,8 +1390,10 @@ if (np->opt) newtp->ext_header_len = np->opt->opt_nflen + np->opt->opt_flen; - tcp_sync_mss(newsk, dst->pmtu); - newtp->advmss = dst->advmss; + newtp->ext2_header_len = dst->header_len; + + tcp_sync_mss(newsk, dst_pmtu(dst)); + newtp->advmss = dst_metric(dst, RTAX_ADVMSS); tcp_initialize_rcv_mss(newsk); newsk->daddr = LOOPBACK4_IPV6; @@ -1557,8 +1562,9 @@ return 0; } -int tcp_v6_rcv(struct sk_buff *skb) +static int tcp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) { + struct sk_buff *skb = *pskb; struct tcphdr *th; struct sock *sk; int ret; @@ -1601,11 +1607,12 @@ goto no_tcp_socket; process: - if(!ipsec_sk_policy(sk,skb)) - goto discard_and_relse; if(sk->state == TCP_TIME_WAIT) goto do_time_wait; + if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) + goto discard_and_relse; + if (sk_filter(sk, skb, 0)) goto discard_and_relse; @@ -1621,9 +1628,12 @@ bh_unlock_sock(sk); sock_put(sk); - return ret; + return ret ? 
-1 : 0; no_tcp_socket: + if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto discard_and_relse; + if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) { bad_packet: TCP_INC_STATS_BH(TcpInErrs); @@ -1645,6 +1655,10 @@ goto discard_it; do_time_wait: + if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { + sock_put(sk); + goto discard_it; + } if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) { TCP_INC_STATS_BH(TcpInErrs); tcp_tw_put((struct tcp_tw_bucket *) sk); @@ -1688,30 +1702,35 @@ if (dst == NULL) { struct flowi fl; + memset(&fl, 0, sizeof(fl)); fl.proto = IPPROTO_TCP; - fl.nl_u.ip6_u.daddr = &np->daddr; - fl.nl_u.ip6_u.saddr = &np->saddr; + ipv6_addr_copy(&fl.fl6_dst, &np->daddr); + ipv6_addr_copy(&fl.fl6_src, &np->saddr); fl.fl6_flowlabel = np->flow_label; fl.oif = sk->bound_dev_if; - fl.uli_u.ports.dport = sk->dport; - fl.uli_u.ports.sport = sk->sport; + fl.fl_ip_dport = sk->dport; + fl.fl_ip_sport = sk->sport; if (np->opt && np->opt->srcrt) { struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt; - fl.nl_u.ip6_u.daddr = rt0->addr; + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); } - dst = ip6_route_output(sk, &fl); + err = ip6_dst_lookup(sk, &dst, &fl); - if (dst->error) { - err = dst->error; - dst_release(dst); + if (err) { sk->route_caps = 0; return err; } ip6_dst_store(sk, dst, NULL); - sk->route_caps = dst->dev->features&~NETIF_F_IP_CSUM; + sk->route_caps = dst->dev->features& + ~(NETIF_F_IP_CSUM +#ifdef NETIF_F_TSO + | NETIF_F_TSO +#endif + ); + tcp_sk(sk)->ext2_header_len = dst->header_len; } return 0; @@ -1724,38 +1743,45 @@ struct flowi fl; struct dst_entry *dst; + memset(&fl, 0, sizeof(fl)); fl.proto = IPPROTO_TCP; - fl.fl6_dst = &np->daddr; - fl.fl6_src = &np->saddr; + ipv6_addr_copy(&fl.fl6_dst, &np->daddr); + ipv6_addr_copy(&fl.fl6_src, &np->saddr); fl.fl6_flowlabel = np->flow_label; IP6_ECN_flow_xmit(sk, fl.fl6_flowlabel); fl.oif = sk->bound_dev_if; - fl.uli_u.ports.sport = sk->sport; - fl.uli_u.ports.dport = sk->dport; + 
fl.fl_ip_sport = sk->sport; + fl.fl_ip_dport = sk->dport; if (np->opt && np->opt->srcrt) { struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt; - fl.nl_u.ip6_u.daddr = rt0->addr; + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); } dst = __sk_dst_check(sk, np->dst_cookie); if (dst == NULL) { - dst = ip6_route_output(sk, &fl); + int err = ip6_dst_lookup(sk, &dst, &fl); - if (dst->error) { - sk->err_soft = -dst->error; - dst_release(dst); - return -sk->err_soft; + if (err) { + sk->err_soft = -err; + return err; } ip6_dst_store(sk, dst, NULL); + sk->route_caps = dst->dev->features & + ~(NETIF_F_IP_CSUM +#ifdef NETIF_F_TSO + | NETIF_F_TSO +#endif + ); + tcp_sk(sk)->ext2_header_len = dst->header_len; } skb->dst = dst_clone(dst); /* Restore final destination back after routing done */ - fl.nl_u.ip6_u.daddr = &np->daddr; + ipv6_addr_copy(&fl.fl6_dst, &np->daddr); return ip6_xmit(sk, skb, &fl, np->opt); } @@ -1865,6 +1891,7 @@ static int tcp_v6_destroy_sock(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct inet_opt *inet = inet_sk(sk); tcp_clear_xmit_timers(sk); @@ -1882,8 +1909,8 @@ tcp_put_port(sk); /* If sendmsg cached page exists, toss it. 
*/ - if (tp->sndmsg_page != NULL) - __free_page(tp->sndmsg_page); + if (inet->sndmsg_page != NULL) + __free_page(inet->sndmsg_page); atomic_dec(&tcp_sockets_allocated); @@ -2143,15 +2170,10 @@ get_port: tcp_v6_get_port, }; -static struct inet6_protocol tcpv6_protocol = -{ - tcp_v6_rcv, /* TCP handler */ - tcp_v6_err, /* TCP error control */ - NULL, /* next */ - IPPROTO_TCP, /* protocol ID */ - 0, /* copy */ - NULL, /* data */ - "TCPv6" /* name */ +static struct inet6_protocol tcpv6_protocol = { + .handler = tcp_v6_rcv, + .err_handler = tcp_v6_err, + .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, }; extern struct proto_ops inet6_stream_ops; @@ -2169,6 +2191,7 @@ void __init tcpv6_init(void) { /* register inet6 protocol */ - inet6_add_protocol(&tcpv6_protocol); + if (inet6_add_protocol(&tcpv6_protocol, IPPROTO_TCP) < 0) + printk(KERN_ERR "tcpv6_init: Could not register protocol\n"); inet6_register_protosw(&tcpv6_protosw); } Index: net/ipv6/udp.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv6/udp.c,v retrieving revision 1.1.1.26 retrieving revision 1.1.1.26.2.1 diff -u -r1.1.1.26 -r1.1.1.26.2.1 --- a/net/ipv6/udp.c 14 Apr 2004 13:05:41 -0000 1.1.1.26 +++ b/net/ipv6/udp.c 16 Apr 2004 13:16:26 -0000 1.1.1.26.2.1 @@ -14,6 +14,7 @@ * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind * a single port at the same time. 
+ * Kazunori MIYAZAWA @USAGI: change process style to use ip6_append_data * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -50,6 +51,7 @@ #include #include +#include struct udp_mib udp_stats_in6[NR_CPUS*2]; @@ -226,7 +228,6 @@ struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; struct in6_addr *daddr; - struct in6_addr saddr; struct dst_entry *dst; struct flowi fl; struct ip6_flowlabel *flowlabel = NULL; @@ -246,7 +247,7 @@ if (usin->sin6_family != AF_INET6) return -EAFNOSUPPORT; - fl.fl6_flowlabel = 0; + memset(&fl, 0, sizeof(fl)); if (np->sndflow) { fl.fl6_flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK; if (fl.fl6_flowlabel&IPV6_FLOWLABEL_MASK) { @@ -271,9 +272,10 @@ if (addr_type == IPV6_ADDR_MAPPED) { struct sockaddr_in sin; - if (__ipv6_only_sock(sk)) - return -ENETUNREACH; - + if (__ipv6_only_sock(sk)) { + err = -ENETUNREACH; + goto out; + } sin.sin_family = AF_INET; sin.sin_addr.s_addr = daddr->s6_addr32[3]; sin.sin_port = usin->sin6_port; @@ -281,8 +283,8 @@ err = udp_connect(sk, (struct sockaddr*) &sin, sizeof(sin)); ipv4_connected: - if (err < 0) - return err; + if (err) + goto out; ipv6_addr_set(&np->daddr, 0, 0, htonl(0x0000ffff), @@ -299,15 +301,15 @@ htonl(0x0000ffff), sk->rcv_saddr); } - return 0; + goto out; } if (addr_type&IPV6_ADDR_LINKLOCAL) { if (addr_len >= sizeof(struct sockaddr_in6) && usin->sin6_scope_id) { if (sk->bound_dev_if && sk->bound_dev_if != usin->sin6_scope_id) { - fl6_sock_release(flowlabel); - return -EINVAL; + err = -EINVAL; + goto out; } sk->bound_dev_if = usin->sin6_scope_id; if (!sk->bound_dev_if && (addr_type&IPV6_ADDR_MULTICAST)) @@ -315,8 +317,10 @@ } /* Connect to link-local address requires an interface */ - if (sk->bound_dev_if == 0) - return -EINVAL; + if (sk->bound_dev_if == 0) { + err = -EINVAL; + goto out; + } } ipv6_addr_copy(&np->daddr, daddr); @@ -330,11 +334,11 @@ */ 
fl.proto = IPPROTO_UDP; - fl.fl6_dst = &np->daddr; - fl.fl6_src = &saddr; + ipv6_addr_copy(&fl.fl6_dst, &np->daddr); + ipv6_addr_copy(&fl.fl6_src, &np->saddr); fl.oif = sk->bound_dev_if; - fl.uli_u.ports.dport = sk->dport; - fl.uli_u.ports.sport = sk->sport; + fl.fl_ip_dport = sk->dport; + fl.fl_ip_sport = sk->sport; if (!fl.oif && (addr_type&IPV6_ADDR_MULTICAST)) fl.oif = np->mcast_oif; @@ -342,37 +346,33 @@ if (flowlabel) { if (flowlabel->opt && flowlabel->opt->srcrt) { struct rt0_hdr *rt0 = (struct rt0_hdr *) flowlabel->opt->srcrt; - fl.fl6_dst = rt0->addr; + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); } } else if (np->opt && np->opt->srcrt) { struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt; - fl.fl6_dst = rt0->addr; + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); } - dst = ip6_route_output(sk, &fl); - - if ((err = dst->error) != 0) { - dst_release(dst); - fl6_sock_release(flowlabel); - return err; - } + err = ip6_dst_lookup(sk, &dst, &fl); + if (err) + goto out; - ip6_dst_store(sk, dst, fl.fl6_dst); + /* source address lookup done in ip6_dst_lookup */ - /* get the source adddress used in the apropriate device */ + if (ipv6_addr_any(&np->saddr)) + ipv6_addr_copy(&np->saddr, &fl.fl6_src); - err = ipv6_get_saddr(dst, daddr, &saddr); + if (ipv6_addr_any(&np->rcv_saddr)) { + ipv6_addr_copy(&np->rcv_saddr, &fl.fl6_src); + sk->rcv_saddr = LOOPBACK4_IPV6; + } - if (err == 0) { - if(ipv6_addr_any(&np->saddr)) - ipv6_addr_copy(&np->saddr, &saddr); + ip6_dst_store(sk, dst, + !ipv6_addr_cmp(&fl.fl6_dst, &np->daddr) ? 
+ &np->daddr : NULL); - if(ipv6_addr_any(&np->rcv_saddr)) { - ipv6_addr_copy(&np->rcv_saddr, &saddr); - sk->rcv_saddr = LOOPBACK4_IPV6; - } - sk->state = TCP_ESTABLISHED; - } + sk->state = TCP_ESTABLISHED; +out: fl6_sock_release(flowlabel); return err; @@ -521,6 +521,11 @@ static inline int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) { + if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) { + kfree_skb(skb); + return -1; + } + #if defined(CONFIG_FILTER) if (sk->filter && skb->ip_summed != CHECKSUM_UNNECESSARY) { if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) { @@ -617,8 +622,9 @@ read_unlock(&udp_hash_lock); } -int udpv6_rcv(struct sk_buff *skb) +static int udpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) { + struct sk_buff *skb = *pskb; struct sock *sk; struct udphdr *uh; struct net_device *dev = skb->dev; @@ -685,6 +691,9 @@ sk = udp_v6_lookup(saddr, uh->source, daddr, uh->dest, dev->ifindex); if (sk == NULL) { + if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto discard; + if (skb->ip_summed != CHECKSUM_UNNECESSARY && (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) goto discard; @@ -711,103 +720,126 @@ kfree_skb(skb); return(0); } - /* - * Sending + * Throw away all pending data and cancel the corking. Socket is locked. 
*/ - -struct udpv6fakehdr +static void udp_v6_flush_pending_frames(struct sock *sk) { - struct udphdr uh; - struct iovec *iov; - __u32 wcheck; - __u32 pl_len; - struct in6_addr *daddr; -}; + struct udp_opt *up = udp_sk(sk); + + if (up->pending) { + up->len = 0; + up->pending = 0; + ip6_flush_pending_frames(sk); + } +} /* - * with checksum + * Sending */ -static int udpv6_getfrag(const void *data, struct in6_addr *addr, - char *buff, unsigned int offset, unsigned int len) +static int udp_v6_push_pending_frames(struct sock *sk, struct udp_opt *up) { - struct udpv6fakehdr *udh = (struct udpv6fakehdr *) data; - char *dst; - int final = 0; - int clen = len; + struct sk_buff *skb; + struct udphdr *uh; + struct ipv6_pinfo *np = inet6_sk(sk); + struct flowi *fl = &np->cork.fl; + int err = 0; - dst = buff; + /* Grab the skbuff where UDP header space exists. */ + if ((skb = skb_peek(&sk->write_queue)) == NULL) + goto out; - if (offset) { - offset -= sizeof(struct udphdr); + /* + * Create a UDP header + */ + uh = skb->h.uh; + uh->source = fl->fl_ip_sport; + uh->dest = fl->fl_ip_dport; + uh->len = htons(up->len); + uh->check = 0; + + if (sk->no_check == UDP_CSUM_NOXMIT) { + skb->ip_summed = CHECKSUM_NONE; + goto send; + } + + if (skb_queue_len(&sk->write_queue) == 1) { + skb->csum = csum_partial((char *)uh, + sizeof(struct udphdr), skb->csum); + uh->check = csum_ipv6_magic(&fl->fl6_src, + &fl->fl6_dst, + up->len, fl->proto, skb->csum); } else { - dst += sizeof(struct udphdr); - final = 1; - clen -= sizeof(struct udphdr); - } + u32 tmp_csum = 0; - if (csum_partial_copy_fromiovecend(dst, udh->iov, offset, - clen, &udh->wcheck)) - return -EFAULT; - - if (final) { - struct in6_addr *daddr; - - udh->wcheck = csum_partial((char *)udh, sizeof(struct udphdr), - udh->wcheck); - - if (udh->daddr) { - daddr = udh->daddr; - } else { - /* - * use packet destination address - * this should improve cache locality - */ - daddr = addr + 1; - } - udh->uh.check = csum_ipv6_magic(addr, daddr, - 
udh->pl_len, IPPROTO_UDP, - udh->wcheck); - if (udh->uh.check == 0) - udh->uh.check = -1; + skb_queue_walk(&sk->write_queue, skb) { + tmp_csum = csum_add(tmp_csum, skb->csum); + } + tmp_csum = csum_partial((char *)uh, + sizeof(struct udphdr), tmp_csum); + tmp_csum = csum_ipv6_magic(&fl->fl6_src, + &fl->fl6_dst, + up->len, fl->proto, tmp_csum); + uh->check = tmp_csum; - memcpy(buff, udh, sizeof(struct udphdr)); } - return 0; + if (uh->check == 0) + uh->check = -1; + +send: + err = ip6_push_pending_frames(sk); +out: + up->len = 0; + up->pending = 0; + return err; } -static int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, int ulen) +static int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, int len) { struct ipv6_txoptions opt_space; - struct udpv6fakehdr udh; + struct udp_opt *up = udp_sk(sk); struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) msg->msg_name; + struct in6_addr *daddr; struct ipv6_txoptions *opt = NULL; struct ip6_flowlabel *flowlabel = NULL; - struct flowi fl; + struct flowi *fl = &np->cork.fl; + struct dst_entry *dst; int addr_len = msg->msg_namelen; - struct in6_addr *daddr; - int len = ulen + sizeof(struct udphdr); + int ulen = len; int addr_type; int hlimit = -1; - + int corkreq = up->corkflag || msg->msg_flags&MSG_MORE; int err; /* Rough check on arithmetic overflow, better check is made in ip6_build_xmit */ - if (ulen < 0 || ulen > INT_MAX - sizeof(struct udphdr)) + if (len < 0 || len > INT_MAX - sizeof(struct udphdr)) return -EMSGSIZE; - fl.fl6_flowlabel = 0; - fl.oif = 0; + if (up->pending) { + /* + * There are pending frames. + * The socket lock must be held while it's corked. 
+ */ + lock_sock(sk); + if (likely(up->pending)) { + dst = NULL; + goto do_append_data; + } + release_sock(sk); + } + ulen += sizeof(struct udphdr); + + memset(fl, 0, sizeof(*fl)); /* fl points at np->cork.fl: clear the whole struct flowi, not sizeof(pointer) bytes */ if (sin6) { if (sin6->sin6_family == AF_INET) { if (__ipv6_only_sock(sk)) return -ENETUNREACH; - return udp_sendmsg(sk, msg, ulen); + return udp_sendmsg(sk, msg, len); } if (addr_len < SIN6_LEN_RFC2133) @@ -819,13 +851,13 @@ if (sin6->sin6_port == 0) return -EINVAL; - udh.uh.dest = sin6->sin6_port; + up->dport = sin6->sin6_port; daddr = &sin6->sin6_addr; if (np->sndflow) { - fl.fl6_flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; - if (fl.fl6_flowlabel&IPV6_FLOWLABEL_MASK) { - flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel); + fl->fl6_flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; + if (fl->fl6_flowlabel&IPV6_FLOWLABEL_MASK) { + flowlabel = fl6_sock_lookup(sk, fl->fl6_flowlabel); if (flowlabel == NULL) return -EINVAL; daddr = &flowlabel->dst; @@ -840,14 +872,14 @@ if (addr_len >= sizeof(struct sockaddr_in6) && sin6->sin6_scope_id && ipv6_addr_type(daddr)&IPV6_ADDR_LINKLOCAL) - fl.oif = sin6->sin6_scope_id; + fl->oif = sin6->sin6_scope_id; } else { if (sk->state != TCP_ESTABLISHED) return -EDESTADDRREQ; - udh.uh.dest = sk->dport; + up->dport = sk->dport; daddr = &sk->net_pinfo.af_inet6.daddr; - fl.fl6_flowlabel = np->flow_label; + fl->fl6_flowlabel = np->flow_label; } addr_type = ipv6_addr_type(daddr); @@ -860,30 +892,28 @@ sin.sin_family = AF_INET; sin.sin_addr.s_addr = daddr->s6_addr32[3]; - sin.sin_port = udh.uh.dest; + sin.sin_port = up->dport; msg->msg_name = (struct sockaddr *)(&sin); msg->msg_namelen = sizeof(sin); fl6_sock_release(flowlabel); - return udp_sendmsg(sk, msg, ulen); + return udp_sendmsg(sk, msg, len); } - udh.daddr = NULL; - if (!fl.oif) - fl.oif = sk->bound_dev_if; - fl.fl6_src = NULL; + if (!fl->oif) + fl->oif = sk->bound_dev_if; if (msg->msg_controllen) { opt = &opt_space; memset(opt, 0, sizeof(struct ipv6_txoptions)); - err = datagram_send_ctl(msg, &fl,
opt, &hlimit); + err = datagram_send_ctl(msg, fl, opt, &hlimit); if (err < 0) { fl6_sock_release(flowlabel); return err; } - if ((fl.fl6_flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) { - flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel); + if ((fl->fl6_flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) { + flowlabel = fl6_sock_lookup(sk, fl->fl6_flowlabel); if (flowlabel == NULL) return -EINVAL; } @@ -894,44 +924,181 @@ opt = np->opt; if (flowlabel) opt = fl6_merge_options(&opt_space, flowlabel, opt); - if (opt && opt->srcrt) - udh.daddr = daddr; - udh.uh.source = sk->sport; - udh.uh.len = len < 0x10000 ? htons(len) : 0; - udh.uh.check = 0; - udh.iov = msg->msg_iov; - udh.wcheck = 0; - udh.pl_len = len; + fl->proto = IPPROTO_UDP; + ipv6_addr_copy(&fl->fl6_dst, daddr); + if (ipv6_addr_any(&fl->fl6_src) && !ipv6_addr_any(&np->saddr)) + ipv6_addr_copy(&fl->fl6_src, &np->saddr); + fl->fl_ip_dport = up->dport; + fl->fl_ip_sport = sk->sport; + + /* merge ip6_build_xmit from ip6_output */ + if (opt && opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt; + ipv6_addr_copy(&fl->fl6_dst, rt0->addr); + } - fl.proto = IPPROTO_UDP; - fl.fl6_dst = daddr; - if (fl.fl6_src == NULL && !ipv6_addr_any(&np->saddr)) - fl.fl6_src = &np->saddr; - fl.uli_u.ports.dport = udh.uh.dest; - fl.uli_u.ports.sport = udh.uh.source; + if (!fl->oif && ipv6_addr_is_multicast(&fl->fl6_dst)) + fl->oif = np->mcast_oif; + + err = ip6_dst_lookup(sk, &dst, fl); + if (err) + goto out; - err = ip6_build_xmit(sk, udpv6_getfrag, &udh, &fl, len, opt, hlimit, - msg->msg_flags); + if (hlimit < 0) { + if (ipv6_addr_is_multicast(&fl->fl6_dst)) + hlimit = np->mcast_hops; + else + hlimit = np->hop_limit; + if (hlimit < 0) + hlimit = dst_metric(dst, RTAX_HOPLIMIT); + } + + if (msg->msg_flags&MSG_CONFIRM) + goto do_confirm; +back_from_confirm: + + lock_sock(sk); + if (unlikely(up->pending)) { + /* The socket is already corked while preparing it. */ + /* ... which is an evident application bug. 
--ANK */ + release_sock(sk); + NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp cork app bug 2\n")); + err = -EINVAL; + goto out; + } + + up->pending = 1; + +do_append_data: + up->len += ulen; + err = ip6_append_data(sk, ip_generic_getfrag, msg->msg_iov, ulen, sizeof(struct udphdr), + hlimit, opt, fl, (struct rt6_info*)dst, + corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); + if (err) + udp_v6_flush_pending_frames(sk); + else if (!corkreq) + err = udp_v6_push_pending_frames(sk, up); + + if (dst) + ip6_dst_store(sk, dst, + !ipv6_addr_cmp(&fl->fl6_dst, &np->daddr) ? + &np->daddr : NULL); + if (err > 0) + err = np->recverr ? net_xmit_errno(err) : 0; + release_sock(sk); +out: fl6_sock_release(flowlabel); + if (!err) { + UDP6_INC_STATS_USER(UdpOutDatagrams); + return len; + } + return err; + +do_confirm: + dst_confirm(dst); + if (!(msg->msg_flags&MSG_PROBE) || len) + goto back_from_confirm; + err = 0; + goto out; +} + +static int udpv6_destroy_sock(struct sock *sk) +{ + lock_sock(sk); + udp_v6_flush_pending_frames(sk); + release_sock(sk); - if (err < 0) - return err; + inet6_destroy_sock(sk); - UDP6_INC_STATS_USER(UdpOutDatagrams); - return ulen; + return 0; } -static struct inet6_protocol udpv6_protocol = +/* + * Socket option code for UDP + */ +static int udpv6_setsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen) { - udpv6_rcv, /* UDP handler */ - udpv6_err, /* UDP error control */ - NULL, /* next */ - IPPROTO_UDP, /* protocol ID */ - 0, /* copy */ - NULL, /* data */ - "UDPv6" /* name */ + struct udp_opt *up = udp_sk(sk); + int val; + int err = 0; + + if (level != SOL_UDP) + return ipv6_setsockopt(sk, level, optname, optval, optlen); + + if(optlen<sizeof(int)) + return -EINVAL; + + if (get_user(val, (int *)optval)) + return -EFAULT; + + switch(optname) { + case UDP_CORK: + if (val != 0) { + up->corkflag = 1; + } else { + up->corkflag = 0; + lock_sock(sk); + udp_v6_push_pending_frames(sk, up); + release_sock(sk); + } + break; + + case UDP_ENCAP: + up->encap_type = val; + break; + + default: + err = -ENOPROTOOPT; + break; + }; + + return err; +} + +static int udpv6_getsockopt(struct
sock *sk, int level, int optname, + char *optval, int *optlen) +{ + struct udp_opt *up = udp_sk(sk); + int val, len; + + if (level != SOL_UDP) + return ipv6_getsockopt(sk, level, optname, optval, optlen); + + if(get_user(len,optlen)) + return -EFAULT; + + len = min_t(unsigned int, len, sizeof(int)); + + if(len < 0) + return -EINVAL; + + switch(optname) { + case UDP_CORK: + val = up->corkflag; + break; + + case UDP_ENCAP: + val = up->encap_type; + break; + + default: + return -ENOPROTOOPT; + }; + + if(put_user(len, optlen)) + return -EFAULT; + if(copy_to_user(optval, &val,len)) + return -EFAULT; + return 0; +} + +static struct inet6_protocol udpv6_protocol = { + .handler = udpv6_rcv, + .err_handler = udpv6_err, + .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, }; #define LINE_LEN 190 @@ -1008,20 +1175,20 @@ } struct proto udpv6_prot = { - name: "UDP", - close: udpv6_close, - connect: udpv6_connect, - disconnect: udp_disconnect, - ioctl: udp_ioctl, - destroy: inet6_destroy_sock, - setsockopt: ipv6_setsockopt, - getsockopt: ipv6_getsockopt, - sendmsg: udpv6_sendmsg, - recvmsg: udpv6_recvmsg, - backlog_rcv: udpv6_queue_rcv_skb, - hash: udp_v6_hash, - unhash: udp_v6_unhash, - get_port: udp_v6_get_port, + .name = "UDP", + .close = udpv6_close, + .connect = udpv6_connect, + .disconnect = udp_disconnect, + .ioctl = udp_ioctl, + .destroy = udpv6_destroy_sock, + .setsockopt = udpv6_setsockopt, + .getsockopt = udpv6_getsockopt, + .sendmsg = udpv6_sendmsg, + .recvmsg = udpv6_recvmsg, + .backlog_rcv = udpv6_queue_rcv_skb, + .hash = udp_v6_hash, + .unhash = udp_v6_unhash, + .get_port = udp_v6_get_port, }; extern struct proto_ops inet6_dgram_ops; @@ -1039,6 +1206,7 @@ void __init udpv6_init(void) { - inet6_add_protocol(&udpv6_protocol); + if (inet6_add_protocol(&udpv6_protocol, IPPROTO_UDP) < 0) + printk(KERN_ERR "udpv6_init: Could not register protocol\n"); inet6_register_protosw(&udpv6_protosw); } Index: net/ipv6/xfrm6_input.c 
=================================================================== RCS file: net/ipv6/xfrm6_input.c diff -N net/ipv6/xfrm6_input.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ b/net/ipv6/xfrm6_input.c 16 Apr 2004 13:16:26 -0000 1.7.14.1 @@ -0,0 +1,142 @@ +/* + * xfrm6_input.c: based on net/ipv4/xfrm4_input.c + * + * Authors: + * Mitsuru KANDA @USAGI + * Kazunori MIYAZAWA @USAGI + * Kunihiro Ishiguro + * YOSHIFUJI Hideaki @USAGI + * IPv6 support + */ + +#include +#include +#include +#include +#include + +static inline void ipip6_ecn_decapsulate(struct sk_buff *skb) +{ + struct ipv6hdr *outer_iph = skb->nh.ipv6h; + struct ipv6hdr *inner_iph = skb->h.ipv6h; + + if (INET_ECN_is_ce(ip6_get_dsfield(outer_iph)) && + INET_ECN_is_not_ce(ip6_get_dsfield(inner_iph))) + IP6_ECN_set_ce(inner_iph); +} + +int xfrm6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) +{ + struct sk_buff *skb = *pskb; + int err; + u32 spi, seq; + struct sec_decap_state xfrm_vec[XFRM_MAX_DEPTH]; + struct xfrm_state *x; + int xfrm_nr = 0; + int decaps = 0; + int nexthdr = 0; + u8 *prevhdr = NULL; + int hhlen; + + ip6_find_1stfragopt(skb, &prevhdr); + nexthdr = *prevhdr; + *nhoffp = prevhdr - skb->nh.raw; + hhlen = skb->nh.raw - skb->mac.raw; + + if ((err = xfrm_parse_spi(skb, nexthdr, &spi, &seq)) != 0) + goto drop; + + do { + struct ipv6hdr *iph = skb->nh.ipv6h; + + if (xfrm_nr == XFRM_MAX_DEPTH) + goto drop_put; + + x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, spi, nexthdr, AF_INET6); + if (x == NULL) + goto drop_put; + spin_lock(&x->lock); + if (unlikely(x->km.state != XFRM_STATE_VALID)) + goto drop_unlock; + + if (x->props.replay_window && xfrm_replay_check(x, seq)) + goto drop_unlock; + + if (xfrm_state_check_expire(x)) + goto drop_unlock; + + nexthdr = x->type->input(x, &(xfrm_vec[xfrm_nr].decap), skb); + if (nexthdr <= 0) + goto drop_unlock; + + if (x->props.replay_window) + xfrm_replay_advance(x, seq); + + x->curlft.bytes += skb->len; + x->curlft.packets++; + + spin_unlock(&x->lock); + + 
xfrm_vec[xfrm_nr++].xvec = x; + + if (x->props.mode) { /* XXX */ + if (nexthdr != IPPROTO_IPV6) + goto drop_put; + decaps = 1; + break; + } + + if ((err = xfrm_parse_spi(skb, nexthdr, &spi, &seq)) < 0) + goto drop_put; + } while (!err); + + /* Allocate new secpath or COW existing one. */ + if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) { + struct sec_path *sp; + sp = secpath_dup(skb->sp); + if (!sp) + goto drop_put; + if (skb->sp) + secpath_put(skb->sp); + skb->sp = sp; + } + + if (xfrm_nr + skb->sp->len > XFRM_MAX_DEPTH) + goto drop_put; + + memcpy(skb->sp->x+skb->sp->len, xfrm_vec, xfrm_nr*sizeof(struct sec_decap_state)); + skb->sp->len += xfrm_nr; + skb->ip_summed = CHECKSUM_NONE; + + if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + goto drop; + + if (decaps) { + skb->mac.raw = memmove(skb->data - hhlen, skb->mac.raw, hhlen); + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + goto drop; + if (!(x->props.flags & XFRM_STATE_NOECN)) + ipip6_ecn_decapsulate(skb); + skb->nh.raw = skb->data; + if (!(skb->dev->flags&IFF_LOOPBACK)) { + dst_release(skb->dst); + skb->dst = NULL; + } + netif_rx(skb); + return -1; + } else { + skb->mac.raw = memmove(skb->nh.raw - hhlen, skb->mac.raw, + hhlen); + return 1; + } + +drop_unlock: + spin_unlock(&x->lock); + xfrm_state_put(x); +drop_put: + while (--xfrm_nr >= 0) + xfrm_state_put(xfrm_vec[xfrm_nr].xvec); +drop: + kfree_skb(skb); + return -1; +} Index: net/ipv6/xfrm6_policy.c =================================================================== RCS file: net/ipv6/xfrm6_policy.c diff -N net/ipv6/xfrm6_policy.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ b/net/ipv6/xfrm6_policy.c 16 Apr 2004 13:16:26 -0000 1.8.12.1 @@ -0,0 +1,296 @@ +/* + * xfrm6_policy.c: based on xfrm4_policy.c + * + * Authors: + * Mitsuru KANDA @USAGI + * Kazunori MIYAZAWA @USAGI + * Kunihiro Ishiguro + * IPv6 support + * YOSHIFUJI Hideaki + * Split up af-specific portion + * + */ + +#include +#include +#include +#include +#include + +extern 
struct dst_ops xfrm6_dst_ops; +extern struct xfrm_policy_afinfo xfrm6_policy_afinfo; + +static struct xfrm_type_map xfrm6_type_map = { .lock = RW_LOCK_UNLOCKED }; + +int xfrm6_dst_lookup(struct xfrm_dst **dst, struct flowi *fl) +{ + int err = 0; + *dst = (struct xfrm_dst*)ip6_route_output(NULL, fl); + if (!*dst) + err = -ENETUNREACH; + return err; +} + +/* Check that the bundle accepts the flow and its components are + * still valid. + */ + +static int __xfrm6_bundle_ok(struct xfrm_dst *xdst, struct flowi *fl) +{ + do { + if (xdst->u.dst.ops != &xfrm6_dst_ops) + return 1; + + if (!xfrm_selector_match(&xdst->u.dst.xfrm->sel, fl, AF_INET6)) + return 0; + if (xdst->u.dst.xfrm->km.state != XFRM_STATE_VALID || + xdst->u.dst.path->obsolete > 0) + return 0; + xdst = (struct xfrm_dst*)xdst->u.dst.child; + } while (xdst); + return 0; +} + +static struct dst_entry * +__xfrm6_find_bundle(struct flowi *fl, struct xfrm_policy *policy) +{ + struct dst_entry *dst; + u32 ndisc_bit = 0; + + if (fl->proto == IPPROTO_ICMPV6 && + (fl->fl_icmp_type == NDISC_NEIGHBOUR_ADVERTISEMENT || + fl->fl_icmp_type == NDISC_NEIGHBOUR_SOLICITATION || + fl->fl_icmp_type == NDISC_ROUTER_SOLICITATION)) + ndisc_bit = RTF_NDISC; + + /* Still not clear if we should set fl->fl6_{src,dst}... 
*/ + read_lock_bh(&policy->lock); + for (dst = policy->bundles; dst; dst = dst->next) { + struct xfrm_dst *xdst = (struct xfrm_dst*)dst; + struct in6_addr fl_dst_prefix, fl_src_prefix; + + if ((xdst->u.rt6.rt6i_flags & RTF_NDISC) != ndisc_bit) + continue; + + ipv6_addr_prefix(&fl_dst_prefix, + &fl->fl6_dst, + xdst->u.rt6.rt6i_dst.plen); + ipv6_addr_prefix(&fl_src_prefix, + &fl->fl6_src, + xdst->u.rt6.rt6i_src.plen); + if (!ipv6_addr_cmp(&xdst->u.rt6.rt6i_dst.addr, &fl_dst_prefix) && + !ipv6_addr_cmp(&xdst->u.rt6.rt6i_src.addr, &fl_src_prefix) && + __xfrm6_bundle_ok(xdst, fl)) { + dst_clone(dst); + break; + } + } + read_unlock_bh(&policy->lock); + return dst; +} + +/* Allocate chain of dst_entry's, attach known xfrm's, calculate + * all the metrics... Shortly, bundle a bundle. + */ + +static int +__xfrm6_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int nx, + struct flowi *fl, struct dst_entry **dst_p) +{ + struct dst_entry *dst, *dst_prev; + struct rt6_info *rt0 = (struct rt6_info*)(*dst_p); + struct rt6_info *rt = rt0; + struct in6_addr *remote = &fl->fl6_dst; + struct in6_addr *local = &fl->fl6_src; + int i; + int err = 0; + int header_len = 0; + int trailer_len = 0; + + dst = dst_prev = NULL; + + for (i = 0; i < nx; i++) { + struct dst_entry *dst1 = dst_alloc(&xfrm6_dst_ops); + + if (unlikely(dst1 == NULL)) { + err = -ENOBUFS; + goto error; + } + + dst1->xfrm = xfrm[i]; + if (!dst) + dst = dst1; + else { + dst_prev->child = dst1; + dst1->flags |= DST_NOHASH; + dst_clone(dst1); + } + dst_prev = dst1; + if (xfrm[i]->props.mode) { + remote = (struct in6_addr*)&xfrm[i]->id.daddr; + local = (struct in6_addr*)&xfrm[i]->props.saddr; + } + header_len += xfrm[i]->props.header_len; + trailer_len += xfrm[i]->props.trailer_len; + } + + if (ipv6_addr_cmp(remote, &fl->fl6_dst)) { + struct flowi fl_tunnel; + + memset(&fl_tunnel, 0, sizeof(fl_tunnel)); + ipv6_addr_copy(&fl_tunnel.fl6_dst, remote); + ipv6_addr_copy(&fl_tunnel.fl6_src, local); + + err = 
xfrm_dst_lookup((struct xfrm_dst **) &rt, + &fl_tunnel, AF_INET6); + if (err) + goto error; + } else { + dst_hold(&rt->u.dst); + } + dst_prev->child = &rt->u.dst; + for (dst_prev = dst; dst_prev != &rt->u.dst; dst_prev = dst_prev->child) { + struct xfrm_dst *x = (struct xfrm_dst*)dst_prev; + + dst_prev->dev = rt->u.dst.dev; + if (rt->u.dst.dev) + dev_hold(rt->u.dst.dev); + dst_prev->obsolete = -1; + dst_prev->flags |= DST_HOST; + dst_prev->lastuse = jiffies; + dst_prev->header_len = header_len; + dst_prev->trailer_len = trailer_len; + memcpy(&dst_prev->metrics, &rt->u.dst.metrics, sizeof(dst_prev->metrics)); + dst_prev->path = &rt->u.dst; + + /* Copy neighbour for reachability confirmation */ + dst_prev->neighbour = neigh_clone(rt->u.dst.neighbour); + dst_prev->input = rt->u.dst.input; + dst_prev->output = dst_prev->xfrm->type->output; + /* Sheit... I remember I did this right. Apparently, + * it was magically lost, so this code needs audit */ + x->u.rt6.rt6i_flags = rt0->rt6i_flags&(RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL|RTF_NDISC); + x->u.rt6.rt6i_metric = rt0->rt6i_metric; + x->u.rt6.rt6i_node = rt0->rt6i_node; + x->u.rt6.rt6i_gateway = rt0->rt6i_gateway; + memcpy(&x->u.rt6.rt6i_gateway, &rt0->rt6i_gateway, sizeof(x->u.rt6.rt6i_gateway)); + x->u.rt6.rt6i_dst = rt0->rt6i_dst; + x->u.rt6.rt6i_src = rt0->rt6i_src; + header_len -= x->u.dst.xfrm->props.header_len; + trailer_len -= x->u.dst.xfrm->props.trailer_len; + } + *dst_p = dst; + return 0; + +error: + if (dst) + dst_free(dst); + return err; +}
+
+/*
+ * Fill *fl with the flow key of the IPv6 packet in skb: source and
+ * destination address, the upper-layer protocol and, for UDP/TCP/SCTP,
+ * the port pair.  Hop-by-hop, routing and destination-option headers
+ * are stepped over; any other next-header value ends the walk with
+ * fl_ipsec_spi cleared and proto set to that value.
+ *
+ * pskb_may_pull() may reallocate the skb header, so pointers into the
+ * packet are re-derived from skb->nh.raw after every call instead of
+ * being cached across it.
+ */
+static inline void
+_decode_session6(struct sk_buff *skb, struct flowi *fl)
+{
+	u16 offset = sizeof(struct ipv6hdr);
+	struct ipv6hdr *hdr = skb->nh.ipv6h;
+	struct ipv6_opt_hdr *exthdr;
+	u8 nexthdr = skb->nh.ipv6h->nexthdr;
+
+	memset(fl, 0, sizeof(struct flowi));
+	ipv6_addr_copy(&fl->fl6_dst, &hdr->daddr);
+	ipv6_addr_copy(&fl->fl6_src, &hdr->saddr);
+
+	while (pskb_may_pull(skb, skb->nh.raw + offset + 1 - skb->data)) {
+		/* The pull above may have moved the header: look it up again. */
+		exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
+
+		switch (nexthdr) {
+		case NEXTHDR_ROUTING:
+		case NEXTHDR_HOP:
+		case NEXTHDR_DEST:
+			offset += ipv6_optlen(exthdr);
+			nexthdr = exthdr->nexthdr;
+			break;
+
+		case IPPROTO_UDP:
+		case IPPROTO_TCP:
+		case IPPROTO_SCTP:
+			if (pskb_may_pull(skb, skb->nh.raw + offset + 4 - skb->data)) {
+				/* Same reason: do not reuse exthdr after a pull. */
+				u16 *ports = (u16 *)(skb->nh.raw + offset);
+
+				fl->fl_ip_sport = ports[0];
+				fl->fl_ip_dport = ports[1];
+			}
+			fl->proto = nexthdr;
+			return;
+
+		/* XXX Why are there these headers? */
+		case IPPROTO_AH:
+		case IPPROTO_ESP:
+		case IPPROTO_COMP:
+		default:
+			fl->fl_ipsec_spi = 0;
+			fl->proto = nexthdr;
+			return;
+		};
+	}
+}
+
+/*
+ * dst_ops.gc hook: run the policy afinfo's garbage_collect callback
+ * under the afinfo read lock, then report non-zero if the number of
+ * entries is still above twice gc_thresh.
+ */
+static inline int xfrm6_garbage_collect(void)
+{
+	read_lock(&xfrm6_policy_afinfo.lock);
+	xfrm6_policy_afinfo.garbage_collect();
+	read_unlock(&xfrm6_policy_afinfo.lock);
+	return (atomic_read(&xfrm6_dst_ops.entries) > xfrm6_dst_ops.gc_thresh*2);
+}
+
+/*
+ * dst_ops.update_pmtu hook: hand the new MTU to the underlying path
+ * dst, except when it is at least 1280 and below dst_pmtu(dst).
+ * NOTE(review): the early-return condition reads as if it might be
+ * inverted -- confirm the intent before relying on it.
+ */
+static void xfrm6_update_pmtu(struct dst_entry *dst, u32 mtu)
+{
+	struct dst_entry *path = dst->path;
+
+	if (mtu >= 1280 && mtu < dst_pmtu(dst))
+		return;
+
+	path->ops->update_pmtu(path, mtu);
+}
+
+/* dst operations for IPv6 xfrm bundles. */
+struct dst_ops xfrm6_dst_ops = {
+	.family =		AF_INET6,
+	.protocol =		__constant_htons(ETH_P_IPV6),
+	.gc =			xfrm6_garbage_collect,
+	.update_pmtu =		xfrm6_update_pmtu,
+	.gc_thresh =		1024,
+	.entry_size =		sizeof(struct xfrm_dst),
+};
+
+/* IPv6 policy callbacks registered by xfrm6_policy_init(). */
+struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
+	.family =		AF_INET6,
+	.lock = 		RW_LOCK_UNLOCKED,
+	.type_map = 		&xfrm6_type_map,
+	.dst_ops =		&xfrm6_dst_ops,
+	.dst_lookup =		xfrm6_dst_lookup,
+	.find_bundle =		__xfrm6_find_bundle,
+	.bundle_create =	__xfrm6_bundle_create,
+	.decode_session =	_decode_session6,
+};
+
+void __init xfrm6_policy_init(void)
+{
+	xfrm_policy_register_afinfo(&xfrm6_policy_afinfo);
+}
+
+void __exit xfrm6_policy_fini(void)
+{
+	xfrm_policy_unregister_afinfo(&xfrm6_policy_afinfo);
+}
+
+/* Register the IPv6 policy callbacks, then the IPv6 state callbacks. */
+void __init xfrm6_init(void)
+{
+	xfrm6_policy_init();
+	xfrm6_state_init();
+}
+
+/* Undo xfrm6_init(). */
+void __exit xfrm6_fini(void)
+{
+	//xfrm6_input_fini();
+	xfrm6_policy_fini();
+	xfrm6_state_fini();
+}
Index: net/ipv6/xfrm6_state.c
===================================================================
RCS file: net/ipv6/xfrm6_state.c
diff -N net/ipv6/xfrm6_state.c
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ b/net/ipv6/xfrm6_state.c 16 Apr 2004 13:16:26 -0000 1.5.18.1
@@ -0,0 +1,134 @@
+/*
+ * xfrm6_state.c: based on xfrm4_state.c
+ *
+ * Authors:
+ *	Mitsuru KANDA @USAGI
+ *	Kazunori MIYAZAWA @USAGI
+ *	Kunihiro Ishiguro
+ *		IPv6 support
+ *	YOSHIFUJI Hideaki @USAGI
+ *		Split up af-specific portion
+ *
+ */
+
+#include
+#include
+#include
+#include
+
+/*
+ * Forward declaration; the definition further down is static, so the
+ * declaration has to carry the same (internal) linkage.
+ */
+static struct xfrm_state_afinfo xfrm6_state_afinfo;
+
+/*
+ * Set up state x for an acquire triggered by flow fl and template tmpl:
+ * the selector matches exactly this session (full /128 addresses, exact
+ * ports, protocol and outgoing interface).  The id comes from the
+ * template; an unspecified destination or source address in the
+ * template is replaced by daddr / saddr.
+ */
+static void
+__xfrm6_init_tempsel(struct xfrm_state *x, struct flowi *fl,
+		     struct xfrm_tmpl *tmpl,
+		     xfrm_address_t *daddr, xfrm_address_t *saddr)
+{
+	/* Initialize temporary selector matching only
+	 * to current session. */
+	ipv6_addr_copy((struct in6_addr *)&x->sel.daddr, &fl->fl6_dst);
+	ipv6_addr_copy((struct in6_addr *)&x->sel.saddr, &fl->fl6_src);
+	x->sel.dport = fl->fl_ip_dport;
+	x->sel.dport_mask = ~0;
+	x->sel.sport = fl->fl_ip_sport;
+	x->sel.sport_mask = ~0;
+	x->sel.prefixlen_d = 128;
+	x->sel.prefixlen_s = 128;
+	x->sel.proto = fl->proto;
+	x->sel.ifindex = fl->oif;
+	x->id = tmpl->id;
+	if (ipv6_addr_any((struct in6_addr*)&x->id.daddr))
+		memcpy(&x->id.daddr, daddr, sizeof(x->sel.daddr));
+	memcpy(&x->props.saddr, &tmpl->saddr, sizeof(x->props.saddr));
+	if (ipv6_addr_any((struct in6_addr*)&x->props.saddr))
+		memcpy(&x->props.saddr, saddr, sizeof(x->props.saddr));
+	x->props.mode = tmpl->mode;
+	x->props.reqid = tmpl->reqid;
+	x->props.family = AF_INET6;
+}
+
+/*
+ * Look up an IPv6 state by (daddr, spi, proto) in the by-SPI hash.
+ * Returns the state with an extra reference taken, or NULL.
+ */
+static struct xfrm_state *
+__xfrm6_state_lookup(xfrm_address_t *daddr, u32 spi, u8 proto)
+{
+	unsigned h = __xfrm6_spi_hash(daddr, spi, proto);
+	struct xfrm_state *x;
+
+	list_for_each_entry(x, xfrm6_state_afinfo.state_byspi+h, byspi) {
+		if (x->props.family == AF_INET6 &&
+		    spi == x->id.spi &&
+		    !ipv6_addr_cmp((struct in6_addr *)daddr, (struct in6_addr *)x->id.daddr.a6) &&
+		    proto == x->id.proto) {
+			xfrm_state_hold(x);
+			return x;
+		}
+	}
+	return NULL;
+}
+
+/*
+ * Find an IPv6 state in XFRM_STATE_ACQ that matches mode, reqid, proto
+ * and both addresses.  A match that has no SPI yet is preferred; failing
+ * that, the first match on the chain is used.  If nothing matches and
+ * create is non-zero, a new ACQ state is allocated, its timer armed for
+ * XFRM_ACQ_EXPIRES seconds, and it is appended to the by-destination
+ * chain.  Returns the state with a reference held, or NULL.
+ */
+static struct xfrm_state *
+__xfrm6_find_acq(u8 mode, u32 reqid, u8 proto,
+		 xfrm_address_t *daddr, xfrm_address_t *saddr,
+		 int create)
+{
+	struct xfrm_state *x, *x0;
+	unsigned h = __xfrm6_dst_hash(daddr);
+
+	x0 = NULL;
+
+	list_for_each_entry(x, xfrm6_state_afinfo.state_bydst+h, bydst) {
+		if (x->props.family == AF_INET6 &&
+		    !ipv6_addr_cmp((struct in6_addr *)daddr, (struct in6_addr *)x->id.daddr.a6) &&
+		    mode == x->props.mode &&
+		    proto == x->id.proto &&
+		    !ipv6_addr_cmp((struct in6_addr *)saddr, (struct in6_addr *)x->props.saddr.a6) &&
+		    reqid == x->props.reqid &&
+		    x->km.state == XFRM_STATE_ACQ) {
+			if (!x0)
+				x0 = x;
+			if (x->id.spi)
+				continue;
+			x0 = x;
+			break;
+		}
+	}
+	if (x0) {
+		xfrm_state_hold(x0);
+	} else if (create && (x0 = xfrm_state_alloc()) != NULL) {
+		memcpy(x0->sel.daddr.a6, daddr, sizeof(struct in6_addr));
+		memcpy(x0->sel.saddr.a6, saddr, sizeof(struct in6_addr));
+		x0->sel.prefixlen_d = 128;
+		x0->sel.prefixlen_s = 128;
+		memcpy(x0->props.saddr.a6, saddr, sizeof(struct in6_addr));
+		x0->km.state = XFRM_STATE_ACQ;
+		memcpy(x0->id.daddr.a6, daddr, sizeof(struct in6_addr));
+		x0->id.proto = proto;
+		x0->props.family = AF_INET6;
+		x0->props.mode = mode;
+		x0->props.reqid = reqid;
+		x0->lft.hard_add_expires_seconds = XFRM_ACQ_EXPIRES;
+		xfrm_state_hold(x0);
+		mod_timer(&x0->timer, jiffies + XFRM_ACQ_EXPIRES*HZ);
+		xfrm_state_hold(x0);
+		list_add_tail(&x0->bydst, xfrm6_state_afinfo.state_bydst+h);
+		wake_up(&km_waitq);
+	}
+	return x0;
+}
+
+/* IPv6 state callbacks registered by xfrm6_state_init(). */
+static struct xfrm_state_afinfo xfrm6_state_afinfo = {
+	.family			= AF_INET6,
+	.lock			= RW_LOCK_UNLOCKED,
+	.init_tempsel		= __xfrm6_init_tempsel,
+	.state_lookup		= __xfrm6_state_lookup,
+	.find_acq		= __xfrm6_find_acq,
+};
+
+void __init xfrm6_state_init(void)
+{
+	xfrm_state_register_afinfo(&xfrm6_state_afinfo);
+}
+
+void __exit xfrm6_state_fini(void)
+{
+	
xfrm_state_unregister_afinfo(&xfrm6_state_afinfo); +} + Index: net/ipv6/netfilter/ip6t_multiport.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/ipv6/netfilter/ip6t_multiport.c,v retrieving revision 1.1.1.12 retrieving revision 1.1.1.12.2.1 diff -u -r1.1.1.12 -r1.1.1.12.2.1 --- a/net/ipv6/netfilter/ip6t_multiport.c 30 Oct 2001 23:08:12 -0000 1.1.1.12 +++ b/net/ipv6/netfilter/ip6t_multiport.c 16 Apr 2004 13:16:26 -0000 1.1.1.12.2.1 @@ -5,6 +5,7 @@ #include #include #include +#include #include #include Index: net/key/Makefile =================================================================== RCS file: net/key/Makefile diff -N net/key/Makefile --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ b/net/key/Makefile 16 Apr 2004 13:16:26 -0000 1.2.18.1 @@ -0,0 +1,9 @@ +# +# Makefile for the key AF. +# + +O_TARGET := key.o + +obj-$(CONFIG_NET_KEY) += af_key.o + +include $(TOPDIR)/Rules.make Index: net/key/af_key.c =================================================================== RCS file: net/key/af_key.c diff -N net/key/af_key.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ b/net/key/af_key.c 16 Apr 2004 13:16:26 -0000 1.6.18.1 @@ -0,0 +1,2852 @@ +/* + * net/key/af_key.c An implementation of PF_KEYv2 sockets. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Maxim Giryaev + * David S. Miller + * Alexey Kuznetsov + * Kunihiro Ishiguro + * Kazunori MIYAZAWA / USAGI Project + * Derek Atkins + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define _X2KEY(x) ((x) == XFRM_INF ? 0 : (x)) +#define _KEY2X(x) ((x) == 0 ? XFRM_INF : (x)) + + +/* List of all pfkey sockets. 
*/
+static struct sock * pfkey_table;
+static DECLARE_WAIT_QUEUE_HEAD(pfkey_table_wait);
+static rwlock_t pfkey_table_lock = RW_LOCK_UNLOCKED;
+/* Number of readers currently walking pfkey_table (see pfkey_lock_table). */
+static atomic_t pfkey_table_users = ATOMIC_INIT(0);
+
+/* Number of PF_KEY sockets that currently exist. */
+static atomic_t pfkey_socks_nr = ATOMIC_INIT(0);
+
+/*
+ * sk->destruct callback.  Purges the receive queue; if the socket is
+ * not marked dead it only logs and returns.  Otherwise it frees the
+ * per-socket pfkey data, decrements pfkey_socks_nr and drops the module
+ * use count taken in pfkey_create().
+ */
+static void pfkey_sock_destruct(struct sock *sk)
+{
+	skb_queue_purge(&sk->receive_queue);
+
+	if (!sk->dead) {
+		printk("Attempt to release alive pfkey socket: %p\n", sk);
+		return;
+	}
+
+	BUG_TRAP(atomic_read(&sk->rmem_alloc)==0);
+	BUG_TRAP(atomic_read(&sk->wmem_alloc)==0);
+
+	kfree(pfkey_sk(sk));
+
+	atomic_dec(&pfkey_socks_nr);
+
+	MOD_DEC_USE_COUNT;
+}
+
+/*
+ * Take pfkey_table_lock for writing and return with it held once
+ * pfkey_table_users has dropped to zero.  While readers remain, the
+ * caller sleeps uninterruptibly on pfkey_table_wait, releasing the lock
+ * around each schedule().  Pair with pfkey_table_ungrab().
+ */
+static void pfkey_table_grab(void)
+{
+	write_lock_bh(&pfkey_table_lock);
+
+	if (atomic_read(&pfkey_table_users)) {
+		DECLARE_WAITQUEUE(wait, current);
+
+		add_wait_queue_exclusive(&pfkey_table_wait, &wait);
+		for(;;) {
+			/* Set the state before re-checking the counter. */
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			if (atomic_read(&pfkey_table_users) == 0)
+				break;
+			write_unlock_bh(&pfkey_table_lock);
+			schedule();
+			write_lock_bh(&pfkey_table_lock);
+		}
+
+		__set_current_state(TASK_RUNNING);
+		remove_wait_queue(&pfkey_table_wait, &wait);
+	}
+}
+
+/* Release the write lock taken by pfkey_table_grab() and wake waiters. */
+static __inline__ void pfkey_table_ungrab(void)
+{
+	write_unlock_bh(&pfkey_table_lock);
+	wake_up(&pfkey_table_wait);
+}
+
+/*
+ * Register the caller as a reader of pfkey_table by incrementing
+ * pfkey_table_users.  The table lock is held only for the increment.
+ */
+static __inline__ void pfkey_lock_table(void)
+{
+	/* read_lock() synchronizes us to pfkey_table_grab */
+
+	read_lock(&pfkey_table_lock);
+	atomic_inc(&pfkey_table_users);
+	read_unlock(&pfkey_table_lock);
+}
+
+/* Drop a reader registration; the last reader wakes pfkey_table_wait. */
+static __inline__ void pfkey_unlock_table(void)
+{
+	if (atomic_dec_and_test(&pfkey_table_users))
+		wake_up(&pfkey_table_wait);
+}
+
+
+static struct proto_ops pfkey_ops;
+
+/* Push sk on the head of pfkey_table; the table holds a reference. */
+static void pfkey_insert(struct sock *sk)
+{
+	pfkey_table_grab();
+	sk->next = pfkey_table;
+	pfkey_table = sk;
+	sock_hold(sk);
+	pfkey_table_ungrab();
+}
+
+/*
+ * Unlink sk from pfkey_table if it is there and drop the table's
+ * reference with __sock_put().
+ */
+static void pfkey_remove(struct sock *sk)
+{
+	struct sock **skp;
+
+	pfkey_table_grab();
+	for (skp = &pfkey_table; *skp; skp = &((*skp)->next)) {
+		if (*skp == sk) {
+			*skp = sk->next;
+			__sock_put(sk);
+			break;
+		}
+	}
+	pfkey_table_ungrab();
+}
+
+/*
+ * Create a PF_KEY socket for sock.
+ *
+ * Returns 0 on success, -EPERM without CAP_NET_ADMIN,
+ * -ESOCKTNOSUPPORT unless sock->type is SOCK_RAW, -EPROTONOSUPPORT
+ * unless protocol is PF_KEY_V2, and -ENOMEM when either allocation
+ * fails.  The module use count is raised before allocating and dropped
+ * again on the failure path.
+ */
+static int pfkey_create(struct socket *sock, int protocol)
+{
+	struct sock *sk;
+	struct pfkey_opt *pfk;
+	int err;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+	if (sock->type != SOCK_RAW)
+		return -ESOCKTNOSUPPORT;
+	if (protocol != PF_KEY_V2)
+		return -EPROTONOSUPPORT;
+
+	MOD_INC_USE_COUNT;
+
+	err = -ENOMEM;
+	sk = sk_alloc(PF_KEY, GFP_KERNEL, 1);
+	if (sk == NULL)
+		goto out;
+
+	sock->ops = &pfkey_ops;
+	sock_init_data(sock, sk);
+
+	err = -ENOMEM;
+	pfk = pfkey_sk(sk) = kmalloc(sizeof(*pfk), GFP_KERNEL);
+	if (!pfk) {
+		sk_free(sk);
+		goto out;
+	}
+	memset(pfk, 0, sizeof(*pfk));
+
+	sk->family = PF_KEY;
+	sk->destruct = pfkey_sock_destruct;
+
+	atomic_inc(&pfkey_socks_nr);
+
+	pfkey_insert(sk);
+
+	return 0;
+
+out:
+	MOD_DEC_USE_COUNT;
+	return err;
+}
+
+/*
+ * Release a PF_KEY socket: take it off pfkey_table, orphan it, purge
+ * its write queue and drop the caller's reference.  Always returns 0.
+ */
+static int pfkey_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+
+	if (!sk)
+		return 0;
+
+	pfkey_remove(sk);
+
+	sock_orphan(sk);
+	sock->sk = NULL;
+	skb_queue_purge(&sk->write_queue);
+	sock_put(sk);
+
+	return 0;
+}
+
+/*
+ * Queue one copy of skb on sk's receive queue.
+ *
+ * *skb2 caches the buffer to deliver next: when it is NULL it is
+ * filled with a clone of skb, or with skb itself plus an extra
+ * reference when skb has exactly one user.  After a successful
+ * delivery *skb2 is reset to NULL so the next call obtains a fresh one;
+ * otherwise it is left for the caller to reuse or free.
+ *
+ * Returns 0 if the buffer was queued, -ENOBUFS if no buffer could be
+ * obtained or sk->rmem_alloc is above sk->rcvbuf.
+ */
+static int pfkey_broadcast_one(struct sk_buff *skb, struct sk_buff **skb2,
+			       int allocation, struct sock *sk)
+{
+	int err = -ENOBUFS;
+
+	sock_hold(sk);
+	if (*skb2 == NULL) {
+		if (atomic_read(&skb->users) != 1) {
+			*skb2 = skb_clone(skb, allocation);
+		} else {
+			*skb2 = skb;
+			atomic_inc(&skb->users);
+		}
+	}
+	if (*skb2 != NULL) {
+		if (atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) {
+			skb_orphan(*skb2);
+			skb_set_owner_r(*skb2, sk);
+			skb_queue_tail(&sk->receive_queue, *skb2);
+			sk->data_ready(sk, (*skb2)->len);
+			*skb2 = NULL;
+			err = 0;
+		}
+	}
+	sock_put(sk);
+	return err;
+}
+
+/* Send SKB to all pfkey sockets matching selected criteria. 
*/
+/* Values for the broadcast_flags argument of pfkey_broadcast(). */
+#define BROADCAST_ALL		0
+#define BROADCAST_ONE		1
+#define BROADCAST_REGISTERED	2
+#define BROADCAST_PROMISC_ONLY	4
+/*
+ * Deliver skb to the PF_KEY sockets selected by broadcast_flags and,
+ * last of all, to one_sk when it is non-NULL.  Promiscuous sockets are
+ * always given a copy, whatever the flags say.
+ *
+ * The caller's reference on skb is dropped before returning, except
+ * when skb is NULL, in which case -ENOMEM is returned at once.
+ *
+ * Return value: the result of the delivery to one_sk when one_sk is
+ * given.  Otherwise -ESRCH, unless BROADCAST_REGISTERED is set, in
+ * which case it is the result of the table deliveries as tracked
+ * below (0 once one of them has succeeded).
+ */
+static int pfkey_broadcast(struct sk_buff *skb, int allocation,
+			   int broadcast_flags, struct sock *one_sk)
+{
+	struct sock *sk;
+	struct sk_buff *skb2 = NULL;
+	int err = -ESRCH;
+
+	/* XXX Do we need something like netlink_overrun?  I think
+	 * XXX PF_KEY socket apps will not mind current behavior.
+	 */
+	if (!skb)
+		return -ENOMEM;
+
+	pfkey_lock_table();
+	for (sk = pfkey_table; sk; sk = sk->next) {
+		struct pfkey_opt *pfk = pfkey_sk(sk);
+		int err2;
+
+		/* Yes, it means that if you are meant to receive this
+		 * pfkey message you receive it twice as promiscuous
+		 * socket.
+		 */
+		if (pfk->promisc)
+			pfkey_broadcast_one(skb, &skb2, allocation, sk);
+
+		/* the exact target will be processed later */
+		if (sk == one_sk)
+			continue;
+		if (broadcast_flags != BROADCAST_ALL) {
+			if (broadcast_flags & BROADCAST_PROMISC_ONLY)
+				continue;
+			if ((broadcast_flags & BROADCAST_REGISTERED) &&
+			    !pfk->registered)
+				continue;
+			if (broadcast_flags & BROADCAST_ONE)
+				continue;
+		}
+
+		err2 = pfkey_broadcast_one(skb, &skb2, allocation, sk);
+
+		/* Error is cleared after successful sending to at least one
+		 * registered KM */
+		if ((broadcast_flags & BROADCAST_REGISTERED) && err)
+			err = err2;
+	}
+	pfkey_unlock_table();
+
+	if (one_sk != NULL)
+		err = pfkey_broadcast_one(skb, &skb2, allocation, one_sk);
+
+	/* skb2 is still set when the last attempted delivery did not queue it. */
+	if (skb2)
+		kfree_skb(skb2);
+	kfree_skb(skb);
+	return err;
+}
+
+/* Copy a whole PF_KEY message header. */
+static inline void pfkey_hdr_dup(struct sadb_msg *new, struct sadb_msg *orig)
+{
+	*new = *orig;
+}
+
+/*
+ * Send an error reply for request header orig to sk (and to
+ * promiscuous listeners, via pfkey_broadcast).  The reply is a bare
+ * copy of orig with sadb_msg_errno set and sadb_msg_len covering only
+ * the header.
+ *
+ * err is a negative errno.  The ERESTART* codes are reported as EINTR
+ * and anything from 512 up as EINVAL; a value that still falls outside
+ * 1..255 after that hits BUG().
+ *
+ * Returns 0, or -ENOBUFS if the reply buffer cannot be allocated.  The
+ * result of the broadcast itself is not propagated.
+ */
+static int pfkey_error(struct sadb_msg *orig, int err, struct sock *sk)
+{
+	struct sk_buff *skb = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_KERNEL);
+	struct sadb_msg *hdr;
+
+	if (!skb)
+		return -ENOBUFS;
+
+	/* Woe be to the platform trying to support PFKEY yet
+	 * having normal errnos outside the 1-255 range, inclusive.
+	 */
+	err = -err;
+	if (err == ERESTARTSYS ||
+	    err == ERESTARTNOHAND ||
+	    err == ERESTARTNOINTR)
+		err = EINTR;
+	if (err >= 512)
+		err = EINVAL;
+	if (err <= 0 || err >= 256)
+		BUG();
+
+	hdr = (struct sadb_msg *) skb_put(skb, sizeof(struct sadb_msg));
+	pfkey_hdr_dup(hdr, orig);
+	hdr->sadb_msg_errno = (uint8_t) err;
+	hdr->sadb_msg_len = (sizeof(struct sadb_msg) /
+			     sizeof(uint64_t));
+
+	pfkey_broadcast(skb, GFP_KERNEL, BROADCAST_ONE, sk);
+
+	return 0;
+}
+
+/*
+ * Minimum length in bytes of each extension, indexed by extension
+ * type; parse_exthdrs() rejects anything shorter.
+ */
+static u8 sadb_ext_min_len[] = {
+	[SADB_EXT_RESERVED]		= (u8) 0,
+	[SADB_EXT_SA]			= (u8) sizeof(struct sadb_sa),
+	[SADB_EXT_LIFETIME_CURRENT]	= (u8) sizeof(struct sadb_lifetime),
+	[SADB_EXT_LIFETIME_HARD]	= (u8) sizeof(struct sadb_lifetime),
+	[SADB_EXT_LIFETIME_SOFT]	= (u8) sizeof(struct sadb_lifetime),
+	[SADB_EXT_ADDRESS_SRC]		= (u8) sizeof(struct sadb_address),
+	[SADB_EXT_ADDRESS_DST]		= (u8) sizeof(struct sadb_address),
+	[SADB_EXT_ADDRESS_PROXY]	= (u8) sizeof(struct sadb_address),
+	[SADB_EXT_KEY_AUTH]		= (u8) sizeof(struct sadb_key),
+	[SADB_EXT_KEY_ENCRYPT]		= (u8) sizeof(struct sadb_key),
+	[SADB_EXT_IDENTITY_SRC]		= (u8) sizeof(struct sadb_ident),
+	[SADB_EXT_IDENTITY_DST]		= (u8) sizeof(struct sadb_ident),
+	[SADB_EXT_SENSITIVITY]		= (u8) sizeof(struct sadb_sens),
+	[SADB_EXT_PROPOSAL]		= (u8) sizeof(struct sadb_prop),
+	[SADB_EXT_SUPPORTED_AUTH]	= (u8) sizeof(struct sadb_supported),
+	[SADB_EXT_SUPPORTED_ENCRYPT]	= (u8) sizeof(struct sadb_supported),
+	[SADB_EXT_SPIRANGE]		= (u8) sizeof(struct sadb_spirange),
+	[SADB_X_EXT_KMPRIVATE]		= (u8) sizeof(struct sadb_x_kmprivate),
+	[SADB_X_EXT_POLICY]		= (u8) sizeof(struct sadb_x_policy),
+	[SADB_X_EXT_SA2]		= (u8) sizeof(struct sadb_x_sa2),
+	[SADB_X_EXT_NAT_T_TYPE]		= (u8) sizeof(struct sadb_x_nat_t_type),
+	[SADB_X_EXT_NAT_T_SPORT]	= (u8) sizeof(struct sadb_x_nat_t_port),
+	[SADB_X_EXT_NAT_T_DPORT]	= (u8) sizeof(struct sadb_x_nat_t_port),
+	[SADB_X_EXT_NAT_T_OA]		= (u8) sizeof(struct sadb_address),
+};
+
+/* Verify sadb_address_{len,prefixlen} against 
sa_family. */
+/*
+ * p points at a struct sadb_address that is immediately followed by a
+ * sockaddr.  Returns 0 when sadb_address_len equals the extension
+ * header plus that family's sockaddr, rounded up to 64-bit words, and
+ * sadb_address_prefixlen does not exceed the family's address width.
+ * Returns -EINVAL otherwise, including for every family other than
+ * AF_INET (and AF_INET6 when IPv6 is configured).
+ */
+static int verify_address_len(void *p)
+{
+	struct sadb_address *ext = p;
+	struct sockaddr *sa = (struct sockaddr *)(ext + 1);
+	int words;		/* expected sadb_address_len */
+	int max_prefixlen;
+
+	if (sa->sa_family == AF_INET) {
+		words  = sizeof(*ext) + sizeof(struct sockaddr_in) + (sizeof(uint64_t) - 1);
+		words /= sizeof(uint64_t);
+		max_prefixlen = 32;
+	}
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	else if (sa->sa_family == AF_INET6) {
+		words  = sizeof(*ext) + sizeof(struct sockaddr_in6) + (sizeof(uint64_t) - 1);
+		words /= sizeof(uint64_t);
+		max_prefixlen = 128;
+	}
+#endif
+	else {
+		/* It is user using kernel to keep track of security
+		 * associations for another protocol, such as
+		 * OSPF/RSVP/RIPV2/MIP.  It is user's job to verify
+		 * lengths.
+		 *
+		 * XXX Actually, association/policy database is not yet
+		 * XXX able to cope with arbitrary sockaddr families.
+		 * XXX When it can, remove this -EINVAL.  -DaveM
+		 */
+		return -EINVAL;
+	}
+
+	if (ext->sadb_address_len != words ||
+	    ext->sadb_address_prefixlen > max_prefixlen)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Returns 1 when both address extensions are present, carry the same
+ * address family, and that family is AF_INET (or AF_INET6 when IPv6 is
+ * configured); 0 otherwise.
+ */
+static int present_and_same_family(struct sadb_address *src,
+				   struct sadb_address *dst)
+{
+	sa_family_t family;
+
+	if (src == NULL || dst == NULL)
+		return 0;
+
+	family = ((struct sockaddr *)(src + 1))->sa_family;
+	if (family != ((struct sockaddr *)(dst + 1))->sa_family)
+		return 0;
+
+	switch (family) {
+	case AF_INET:
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	case AF_INET6:
+#endif
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+/*
+ * Walk the extensions that follow hdr in skb and record a pointer to
+ * each known one in ext_hdrs[type - 1].
+ *
+ * Returns -EINVAL for a zero-length extension, one that runs past the
+ * end of the message, a reserved type, a known type that is shorter
+ * than its minimum or appears twice, or an address extension that
+ * fails verify_address_len().  Types above SADB_EXT_MAX are skipped.
+ * Returns 0 once the whole message has been consumed.
+ */
+static int parse_exthdrs(struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
+{
+	char *cur = (char *) hdr;
+	int remaining = skb->len;
+
+	remaining -= sizeof(*hdr);
+	cur += sizeof(*hdr);
+
+	while (remaining > 0) {
+		struct sadb_ext *ext = (struct sadb_ext *) cur;
+		uint16_t type = ext->sadb_ext_type;
+		int bytes = ext->sadb_ext_len;
+
+		bytes *= sizeof(uint64_t);
+		if (bytes < sizeof(uint64_t) ||
+		    bytes > remaining ||
+		    type == SADB_EXT_RESERVED)
+			return -EINVAL;
+
+		if (type <= SADB_EXT_MAX) {
+			int shortest = (int) sadb_ext_min_len[type];
+
+			if (bytes < shortest)
+				return -EINVAL;
+			if (ext_hdrs[type-1] != NULL)
+				return -EINVAL;
+
+			switch (type) {
+			case SADB_EXT_ADDRESS_SRC:
+			case SADB_EXT_ADDRESS_DST:
+			case SADB_EXT_ADDRESS_PROXY:
+			case SADB_X_EXT_NAT_T_OA:
+				if (verify_address_len(cur))
+					return -EINVAL;
+				break;
+			default:
+				break;
+			}
+			ext_hdrs[type-1] = cur;
+		}
+
+		cur += bytes;
+		remaining -= bytes;
+	}
+
+	return 0;
+}
+
+/* Map a PF_KEY SA type to an IP protocol number; 0 means unknown. */
+static uint16_t
+pfkey_satype2proto(uint8_t satype)
+{
+	uint16_t proto = 0;
+
+	if (satype == SADB_SATYPE_UNSPEC)
+		proto = IPSEC_PROTO_ANY;
+	else if (satype == SADB_SATYPE_AH)
+		proto = IPPROTO_AH;
+	else if (satype == SADB_SATYPE_ESP)
+		proto = IPPROTO_ESP;
+	else if (satype == SADB_X_SATYPE_IPCOMP)
+		proto = IPPROTO_COMP;
+
+	return proto;
+}
+
+/* Map an IP protocol number to a PF_KEY SA type; 0 means unknown. */
+static uint8_t
+pfkey_proto2satype(uint16_t proto)
+{
+	uint8_t satype = 0;
+
+	if (proto == IPPROTO_AH)
+		satype = SADB_SATYPE_AH;
+	else if (proto == IPPROTO_ESP)
+		satype = SADB_SATYPE_ESP;
+	else if (proto == IPPROTO_COMP)
+		satype = SADB_X_SATYPE_IPCOMP;
+
+	return satype;
+}
+
+/* BTW, this scheme means that there is no way with PFKEY2 sockets to
+ * say specifically 'just raw sockets' as we encode them as 255.
+ */
+
+/* PF_KEY "any protocol" becomes 0 on the xfrm side. */
+static uint8_t pfkey_proto_to_xfrm(uint8_t proto)
+{
+	if (proto == IPSEC_PROTO_ANY)
+		return 0;
+	return proto;
+}
+
+/* xfrm 0 becomes PF_KEY "any protocol". */
+static uint8_t pfkey_proto_from_xfrm(uint8_t proto)
+{
+	if (proto == 0)
+		return IPSEC_PROTO_ANY;
+	return proto;
+}
+
+/*
+ * Copy the address out of the sockaddr that follows addr into *xaddr.
+ * Returns the address family, or 0 if the family is not handled (in
+ * which case *xaddr is left alone).
+ */
+static int pfkey_sadb_addr2xfrm_addr(struct sadb_address *addr,
+				     xfrm_address_t *xaddr)
+{
+	struct sockaddr *sa = (struct sockaddr *)(addr + 1);
+
+	if (sa->sa_family == AF_INET) {
+		xaddr->a4 = ((struct sockaddr_in *)sa)->sin_addr.s_addr;
+		return AF_INET;
+	}
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	if (sa->sa_family == AF_INET6) {
+		memcpy(xaddr->a6,
+		       &((struct sockaddr_in6 *)sa)->sin6_addr,
+		       sizeof(struct in6_addr));
+		return AF_INET6;
+	}
+#endif
+	return 0;
+}
+
+/*
+ * Look up the state named by a parsed message: SPI from the SA
+ * extension, protocol from the message's SA type and address from the
+ * destination extension.  Returns whatever xfrm_state_lookup() returns,
+ * or NULL if either extension is missing, the SA type is unknown or the
+ * address family is not handled.
+ */
+static struct xfrm_state *pfkey_xfrm_state_lookup(struct sadb_msg *hdr, void **ext_hdrs)
+{
+	struct sadb_sa *sa = (struct sadb_sa *) ext_hdrs[SADB_EXT_SA-1];
+	struct sadb_address *dst;
+	struct sockaddr *sockaddr;
+	xfrm_address_t *key;
+	unsigned short family;
+	uint16_t proto;
+
+	if (sa == NULL)
+		return NULL;
+
+	proto = pfkey_satype2proto(hdr->sadb_msg_satype);
+	if (proto == 0)
+		return NULL;
+
+	/* sadb_address_len should be checked by caller */
+	dst = (struct sadb_address *) ext_hdrs[SADB_EXT_ADDRESS_DST-1];
+	if (dst == NULL)
+		return NULL;
+
+	sockaddr = (struct sockaddr *)(dst + 1);
+	family = sockaddr->sa_family;
+	switch (family) {
+	case AF_INET:
+		key = (xfrm_address_t *)&((struct sockaddr_in *)sockaddr)->sin_addr;
+		break;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	case AF_INET6:
+		key = (xfrm_address_t *)&((struct sockaddr_in6 *)sockaddr)->sin6_addr;
+		break;
+#endif
+	default:
+		return NULL;
+	}
+
+	return xfrm_state_lookup(key, sa->sadb_sa_spi, proto, family);
+}
+
+/* Round a (> 0) up to the next multiple of 8. */
+#define PFKEY_ALIGN8(a) (1 + (((a) - 1) | (8 - 1)))
+/*
+ * Size of the sockaddr for family as carried in a PF_KEY address
+ * extension (rounded up to 8 bytes), or 0 if the family is not handled.
+ */
+static int
+pfkey_sockaddr_size(sa_family_t family)
+{
+	switch (family) {
+	case AF_INET:
+		return PFKEY_ALIGN8(sizeof(struct sockaddr_in));
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	case AF_INET6:
+		return PFKEY_ALIGN8(sizeof(struct sockaddr_in6));
+#endif
+	default:
+		return 0;
+	}
+	/* NOTREACHED */
+}
+
+/*
+ * Build a PF_KEY message describing state x.
+ *
+ * @x:        state to describe
+ * @add_keys: non-zero to include the authentication / encryption key
+ *            extensions
+ * @hsc:      bit 0 (value 1) adds the soft lifetime extension, bit 1
+ *            (value 2) the hard lifetime extension; the current
+ *            lifetime is always included
+ *
+ * The message holds, in order: header, SA, hard / soft / current
+ * lifetimes, source address, destination address, a proxy address when
+ * the selector's source differs from the SA's source, the keys, SA2
+ * and, when the state has an encapsulation type, the NAT-T type and
+ * ports.  Only sadb_msg_len is set in the header; the rest is left
+ * zeroed for the caller to fill in.
+ *
+ * Returns the new skb, ERR_PTR(-EINVAL) if x's address family is not
+ * handled, or ERR_PTR(-ENOBUFS) if the buffer cannot be allocated.
+ */
+static struct sk_buff * pfkey_xfrm_state2msg(struct xfrm_state *x, int add_keys, int hsc)
+{
+	struct sk_buff *skb;
+	struct sadb_msg *hdr;
+	struct sadb_sa *sa;
+	struct sadb_lifetime *lifetime;
+	struct sadb_address *addr;
+	struct sadb_key *key;
+	struct sadb_x_sa2 *sa2;
+	struct sockaddr_in *sin;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	struct sockaddr_in6 *sin6;
+#endif
+	int size;
+	int auth_key_size = 0;
+	int encrypt_key_size = 0;
+	int sockaddr_size;
+	struct xfrm_encap_tmpl *natt = NULL;
+
+	/* address family check: bail out instead of running into the
+	 * BUG() in the address code below */
+	sockaddr_size = pfkey_sockaddr_size(x->props.family);
+	if (!sockaddr_size)
+		return ERR_PTR(-EINVAL);
+
+	/* base, SA, (lifetime (HSC),) address(SD), (address(P),)
+	   key(AE), (identity(SD),) (sensitivity)> */
+	size = sizeof(struct sadb_msg) +sizeof(struct sadb_sa) +
+		sizeof(struct sadb_lifetime) +
+		((hsc & 1) ? sizeof(struct sadb_lifetime) : 0) +
+		((hsc & 2) ? sizeof(struct sadb_lifetime) : 0) +
+			sizeof(struct sadb_address)*2 +
+				sockaddr_size*2 +
+					sizeof(struct sadb_x_sa2);
+	/* identity & sensitivity */
+
+	/* Room for the proxy address; must mirror the tests further down. */
+	if ((x->props.family == AF_INET &&
+	     x->sel.saddr.a4 != x->props.saddr.a4)
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	    || (x->props.family == AF_INET6 &&
+		memcmp (x->sel.saddr.a6, x->props.saddr.a6, sizeof (struct in6_addr)))
+#endif
+		)
+		size += sizeof(struct sadb_address) + sockaddr_size;
+
+	if (add_keys) {
+		if (x->aalg && x->aalg->alg_key_len) {
+			auth_key_size =
+				PFKEY_ALIGN8((x->aalg->alg_key_len + 7) / 8);
+			size += sizeof(struct sadb_key) + auth_key_size;
+		}
+		if (x->ealg && x->ealg->alg_key_len) {
+			encrypt_key_size =
+				PFKEY_ALIGN8((x->ealg->alg_key_len+7) / 8);
+			size += sizeof(struct sadb_key) + encrypt_key_size;
+		}
+	}
+	if (x->encap)
+		natt = x->encap;
+
+	if (natt && natt->encap_type) {
+		size += sizeof(struct sadb_x_nat_t_type);
+		size += sizeof(struct sadb_x_nat_t_port);
+		size += sizeof(struct sadb_x_nat_t_port);
+	}
+
+	skb =  alloc_skb(size + 16, GFP_ATOMIC);
+	if (skb == NULL)
+		return ERR_PTR(-ENOBUFS);
+
+	/* call should fill header later */
+	hdr = (struct sadb_msg *) skb_put(skb, sizeof(struct sadb_msg));
+	memset(hdr, 0, size);	/* XXX do we need this ? */
+	hdr->sadb_msg_len = size / sizeof(uint64_t);
+
+	/* sa */
+	sa = (struct sadb_sa *)  skb_put(skb, sizeof(struct sadb_sa));
+	sa->sadb_sa_len = sizeof(struct sadb_sa)/sizeof(uint64_t);
+	sa->sadb_sa_exttype = SADB_EXT_SA;
+	sa->sadb_sa_spi = x->id.spi;
+	sa->sadb_sa_replay = x->props.replay_window;
+	/* DYING unless one of the cases below applies */
+	sa->sadb_sa_state = SADB_SASTATE_DYING;
+	if (x->km.state == XFRM_STATE_VALID && !x->km.dying)
+		sa->sadb_sa_state = SADB_SASTATE_MATURE;
+	else if (x->km.state == XFRM_STATE_ACQ)
+		sa->sadb_sa_state = SADB_SASTATE_LARVAL;
+	else if (x->km.state == XFRM_STATE_EXPIRED)
+		sa->sadb_sa_state = SADB_SASTATE_DEAD;
+	sa->sadb_sa_auth = 0;
+	if (x->aalg) {
+		struct xfrm_algo_desc *a = xfrm_aalg_get_byname(x->aalg->alg_name);
+		sa->sadb_sa_auth = a ? a->desc.sadb_alg_id : 0;
+	}
+	sa->sadb_sa_encrypt = 0;
+	BUG_ON(x->ealg && x->calg);
+	if (x->ealg) {
+		struct xfrm_algo_desc *a = xfrm_ealg_get_byname(x->ealg->alg_name);
+		sa->sadb_sa_encrypt = a ? a->desc.sadb_alg_id : 0;
+	}
+	/* KAME compatible: sadb_sa_encrypt is overloaded with calg id */
+	if (x->calg) {
+		struct xfrm_algo_desc *a = xfrm_calg_get_byname(x->calg->alg_name);
+		sa->sadb_sa_encrypt = a ? a->desc.sadb_alg_id : 0;
+	}
+
+	sa->sadb_sa_flags = 0;
+	if (x->props.flags & XFRM_STATE_NOECN)
+		sa->sadb_sa_flags |= SADB_SAFLAGS_NOECN;
+
+	/* hard time */
+	if (hsc & 2) {
+		lifetime = (struct sadb_lifetime *)  skb_put(skb,
+							     sizeof(struct sadb_lifetime));
+		lifetime->sadb_lifetime_len =
+			sizeof(struct sadb_lifetime)/sizeof(uint64_t);
+		lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_HARD;
+		lifetime->sadb_lifetime_allocations =  _X2KEY(x->lft.hard_packet_limit);
+		lifetime->sadb_lifetime_bytes = _X2KEY(x->lft.hard_byte_limit);
+		lifetime->sadb_lifetime_addtime = x->lft.hard_add_expires_seconds;
+		lifetime->sadb_lifetime_usetime = x->lft.hard_use_expires_seconds;
+	}
+	/* soft time */
+	if (hsc & 1) {
+		lifetime = (struct sadb_lifetime *)  skb_put(skb,
+							     sizeof(struct sadb_lifetime));
+		lifetime->sadb_lifetime_len =
+			sizeof(struct sadb_lifetime)/sizeof(uint64_t);
+		lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_SOFT;
+		lifetime->sadb_lifetime_allocations =  _X2KEY(x->lft.soft_packet_limit);
+		lifetime->sadb_lifetime_bytes = _X2KEY(x->lft.soft_byte_limit);
+		lifetime->sadb_lifetime_addtime = x->lft.soft_add_expires_seconds;
+		lifetime->sadb_lifetime_usetime = x->lft.soft_use_expires_seconds;
+	}
+	/* current time */
+	lifetime = (struct sadb_lifetime *)  skb_put(skb,
+						     sizeof(struct sadb_lifetime));
+	lifetime->sadb_lifetime_len =
+		sizeof(struct sadb_lifetime)/sizeof(uint64_t);
+	lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_CURRENT;
+	lifetime->sadb_lifetime_allocations = x->curlft.packets;
+	lifetime->sadb_lifetime_bytes = x->curlft.bytes;
+	lifetime->sadb_lifetime_addtime = x->curlft.add_time;
+	lifetime->sadb_lifetime_usetime = x->curlft.use_time;
+	/* src address */
+	addr = (struct sadb_address*) skb_put(skb,
+					      sizeof(struct sadb_address)+sockaddr_size);
+	addr->sadb_address_len =
+		(sizeof(struct sadb_address)+sockaddr_size)/
+			sizeof(uint64_t);
+	addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC;
+	/* "if the ports are non-zero, then the sadb_address_proto field,
+	   normally zero, MUST be filled in with the transport
+	   protocol's number." - RFC2367 */
+	addr->sadb_address_proto = 0;
+	addr->sadb_address_reserved = 0;
+	if (x->props.family == AF_INET) {
+		addr->sadb_address_prefixlen = 32;
+
+		sin = (struct sockaddr_in *) (addr + 1);
+		sin->sin_family = AF_INET;
+		sin->sin_addr.s_addr = x->props.saddr.a4;
+		sin->sin_port = 0;
+		memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+	}
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	else if (x->props.family == AF_INET6) {
+		addr->sadb_address_prefixlen = 128;
+
+		sin6 = (struct sockaddr_in6 *) (addr + 1);
+		sin6->sin6_family = AF_INET6;
+		sin6->sin6_port = 0;
+		sin6->sin6_flowinfo = 0;
+		memcpy(&sin6->sin6_addr, x->props.saddr.a6,
+		       sizeof(struct in6_addr));
+		sin6->sin6_scope_id = 0;
+	}
+#endif
+	else
+		BUG();
+
+	/* dst address */
+	addr = (struct sadb_address*) skb_put(skb,
+					      sizeof(struct sadb_address)+sockaddr_size);
+	addr->sadb_address_len =
+		(sizeof(struct sadb_address)+sockaddr_size)/
+			sizeof(uint64_t);
+	addr->sadb_address_exttype = SADB_EXT_ADDRESS_DST;
+	addr->sadb_address_proto = 0;
+	addr->sadb_address_prefixlen = 32; /* XXX */
+	addr->sadb_address_reserved = 0;
+	if (x->props.family == AF_INET) {
+		sin = (struct sockaddr_in *) (addr + 1);
+		sin->sin_family = AF_INET;
+		sin->sin_addr.s_addr = x->id.daddr.a4;
+		sin->sin_port = 0;
+		memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+
+		/* proxy address: the selector's source, when it differs */
+		if (x->sel.saddr.a4 != x->props.saddr.a4) {
+			addr = (struct sadb_address*) skb_put(skb,
+				sizeof(struct sadb_address)+sockaddr_size);
+			addr->sadb_address_len =
+				(sizeof(struct sadb_address)+sockaddr_size)/
+				sizeof(uint64_t);
+			addr->sadb_address_exttype = SADB_EXT_ADDRESS_PROXY;
+			addr->sadb_address_proto =
+				pfkey_proto_from_xfrm(x->sel.proto);
+			addr->sadb_address_prefixlen = x->sel.prefixlen_s;
+			addr->sadb_address_reserved = 0;
+
+			sin = (struct sockaddr_in *) (addr + 1);
+			sin->sin_family = AF_INET;
+			sin->sin_addr.s_addr = x->sel.saddr.a4;
+			sin->sin_port = x->sel.sport;
+			memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+		}
+	}
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	else if (x->props.family == AF_INET6) {
+		addr->sadb_address_prefixlen = 128;
+
+		sin6 = (struct sockaddr_in6 *) (addr + 1);
+		sin6->sin6_family = AF_INET6;
+		sin6->sin6_port = 0;
+		sin6->sin6_flowinfo = 0;
+		memcpy(&sin6->sin6_addr, x->id.daddr.a6, sizeof(struct in6_addr));
+		sin6->sin6_scope_id = 0;
+
+		/* proxy address: the selector's source, when it differs */
+		if (memcmp (x->sel.saddr.a6, x->props.saddr.a6,
+			    sizeof(struct in6_addr))) {
+			addr = (struct sadb_address *) skb_put(skb,
+				sizeof(struct sadb_address)+sockaddr_size);
+			addr->sadb_address_len =
+				(sizeof(struct sadb_address)+sockaddr_size)/
+				sizeof(uint64_t);
+			addr->sadb_address_exttype = SADB_EXT_ADDRESS_PROXY;
+			addr->sadb_address_proto =
+				pfkey_proto_from_xfrm(x->sel.proto);
+			addr->sadb_address_prefixlen = x->sel.prefixlen_s;
+			addr->sadb_address_reserved = 0;
+
+			sin6 = (struct sockaddr_in6 *) (addr + 1);
+			sin6->sin6_family = AF_INET6;
+			sin6->sin6_port = x->sel.sport;
+			sin6->sin6_flowinfo = 0;
+			memcpy(&sin6->sin6_addr, x->sel.saddr.a6,
+			       sizeof(struct in6_addr));
+			sin6->sin6_scope_id = 0;
+		}
+	}
+#endif
+	else
+		BUG();
+
+	/* auth key */
+	if (add_keys && auth_key_size) {
+		key = (struct sadb_key *) skb_put(skb,
+						  sizeof(struct sadb_key)+auth_key_size);
+		key->sadb_key_len = (sizeof(struct sadb_key) + auth_key_size) /
+			sizeof(uint64_t);
+		key->sadb_key_exttype = SADB_EXT_KEY_AUTH;
+		key->sadb_key_bits = x->aalg->alg_key_len;
+		key->sadb_key_reserved = 0;
+		memcpy(key + 1, x->aalg->alg_key, (x->aalg->alg_key_len+7)/8);
+	}
+	/* encrypt key */
+	if (add_keys && encrypt_key_size) {
+		key = (struct sadb_key *) skb_put(skb,
+						  sizeof(struct sadb_key)+encrypt_key_size);
+		key->sadb_key_len = (sizeof(struct sadb_key) +
+				     encrypt_key_size) / sizeof(uint64_t);
+		key->sadb_key_exttype = SADB_EXT_KEY_ENCRYPT;
+		key->sadb_key_bits = x->ealg->alg_key_len;
+		key->sadb_key_reserved = 0;
+		memcpy(key + 1, x->ealg->alg_key,
+		       (x->ealg->alg_key_len+7)/8);
+	}
+
+	/* sa */
+	sa2 = (struct sadb_x_sa2 *)  skb_put(skb, sizeof(struct sadb_x_sa2));
+	sa2->sadb_x_sa2_len = sizeof(struct sadb_x_sa2)/sizeof(uint64_t);
+	sa2->sadb_x_sa2_exttype = SADB_X_EXT_SA2;
+	sa2->sadb_x_sa2_mode = x->props.mode + 1;
+	sa2->sadb_x_sa2_reserved1 = 0;
+	sa2->sadb_x_sa2_reserved2 = 0;
+	sa2->sadb_x_sa2_sequence = 0;
+	sa2->sadb_x_sa2_reqid = x->props.reqid;
+
+	if (natt && natt->encap_type) {
+		struct sadb_x_nat_t_type *n_type;
+		struct sadb_x_nat_t_port *n_port;
+
+		/* type */
+		n_type = (struct sadb_x_nat_t_type*) skb_put(skb, sizeof(*n_type));
+		n_type->sadb_x_nat_t_type_len = sizeof(*n_type)/sizeof(uint64_t);
+		n_type->sadb_x_nat_t_type_exttype = SADB_X_EXT_NAT_T_TYPE;
+		n_type->sadb_x_nat_t_type_type = natt->encap_type;
+		n_type->sadb_x_nat_t_type_reserved[0] = 0;
+		n_type->sadb_x_nat_t_type_reserved[1] = 0;
+		n_type->sadb_x_nat_t_type_reserved[2] = 0;
+
+		/* source port */
+		n_port = (struct sadb_x_nat_t_port*) skb_put(skb, sizeof (*n_port));
+		n_port->sadb_x_nat_t_port_len = sizeof(*n_port)/sizeof(uint64_t);
+		n_port->sadb_x_nat_t_port_exttype = SADB_X_EXT_NAT_T_SPORT;
+		n_port->sadb_x_nat_t_port_port = natt->encap_sport;
+		n_port->sadb_x_nat_t_port_reserved = 0;
+
+		/* dest port */
+		n_port = (struct sadb_x_nat_t_port*) skb_put(skb, sizeof (*n_port));
+		n_port->sadb_x_nat_t_port_len = sizeof(*n_port)/sizeof(uint64_t);
+		n_port->sadb_x_nat_t_port_exttype = SADB_X_EXT_NAT_T_DPORT;
+		n_port->sadb_x_nat_t_port_port = natt->encap_dport;
+		n_port->sadb_x_nat_t_port_reserved = 0;
+	}
+
+	return skb;
+}
+
+static struct xfrm_state * pfkey_msg2xfrm_state(struct sadb_msg *hdr, + void **ext_hdrs) +{ + struct xfrm_state *x; + struct sadb_lifetime *lifetime; + struct sadb_sa *sa; + struct sadb_key *key; + uint16_t proto; + + + sa = (struct sadb_sa *) ext_hdrs[SADB_EXT_SA-1]; + if (!sa || + !present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], + ext_hdrs[SADB_EXT_ADDRESS_DST-1])) + return 
ERR_PTR(-EINVAL); + if (hdr->sadb_msg_satype == SADB_SATYPE_ESP && + !ext_hdrs[SADB_EXT_KEY_ENCRYPT-1]) + return ERR_PTR(-EINVAL); + if (hdr->sadb_msg_satype == SADB_SATYPE_AH && + !ext_hdrs[SADB_EXT_KEY_AUTH-1]) + return ERR_PTR(-EINVAL); + if (!!ext_hdrs[SADB_EXT_LIFETIME_HARD-1] != + !!ext_hdrs[SADB_EXT_LIFETIME_SOFT-1]) + return ERR_PTR(-EINVAL); + + proto = pfkey_satype2proto(hdr->sadb_msg_satype); + if (proto == 0) + return ERR_PTR(-EINVAL); + + /* RFC2367: + + Only SADB_SASTATE_MATURE SAs may be submitted in an SADB_ADD message. + SADB_SASTATE_LARVAL SAs are created by SADB_GETSPI and it is not + sensible to add a new SA in the DYING or SADB_SASTATE_DEAD state. + Therefore, the sadb_sa_state field of all submitted SAs MUST be + SADB_SASTATE_MATURE and the kernel MUST return an error if this is + not true. + + However, KAME setkey always uses SADB_SASTATE_LARVAL. + Hence, we have to _ignore_ sadb_sa_state, which is also reasonable. + */ + if (sa->sadb_sa_auth > SADB_AALG_MAX || + (hdr->sadb_msg_satype == SADB_X_SATYPE_IPCOMP && + sa->sadb_sa_encrypt > SADB_X_CALG_MAX) || + sa->sadb_sa_encrypt > SADB_EALG_MAX) + return ERR_PTR(-EINVAL); + key = (struct sadb_key*) ext_hdrs[SADB_EXT_KEY_AUTH-1]; + if (key != NULL && + sa->sadb_sa_auth != SADB_X_AALG_NULL && + ((key->sadb_key_bits+7) / 8 == 0 || + (key->sadb_key_bits+7) / 8 > key->sadb_key_len * sizeof(uint64_t))) + return ERR_PTR(-EINVAL); + key = ext_hdrs[SADB_EXT_KEY_ENCRYPT-1]; + if (key != NULL && + sa->sadb_sa_encrypt != SADB_EALG_NULL && + ((key->sadb_key_bits+7) / 8 == 0 || + (key->sadb_key_bits+7) / 8 > key->sadb_key_len * sizeof(uint64_t))) + return ERR_PTR(-EINVAL); + + x = xfrm_state_alloc(); + if (x == NULL) + return ERR_PTR(-ENOBUFS); + + x->id.proto = proto; + x->id.spi = sa->sadb_sa_spi; + x->props.replay_window = sa->sadb_sa_replay; + if (sa->sadb_sa_flags & SADB_SAFLAGS_NOECN) + x->props.flags |= XFRM_STATE_NOECN; + + lifetime = (struct sadb_lifetime*) ext_hdrs[SADB_EXT_LIFETIME_HARD-1]; + if 
(lifetime != NULL) { + x->lft.hard_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations); + x->lft.hard_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes); + x->lft.hard_add_expires_seconds = lifetime->sadb_lifetime_addtime; + x->lft.hard_use_expires_seconds = lifetime->sadb_lifetime_usetime; + } + lifetime = (struct sadb_lifetime*) ext_hdrs[SADB_EXT_LIFETIME_SOFT-1]; + if (lifetime != NULL) { + x->lft.soft_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations); + x->lft.soft_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes); + x->lft.soft_add_expires_seconds = lifetime->sadb_lifetime_addtime; + x->lft.soft_use_expires_seconds = lifetime->sadb_lifetime_usetime; + } + key = (struct sadb_key*) ext_hdrs[SADB_EXT_KEY_AUTH-1]; + if (sa->sadb_sa_auth) { + int keysize = 0; + struct xfrm_algo_desc *a = xfrm_aalg_get_byid(sa->sadb_sa_auth); + if (!a) + goto out; + if (key) + keysize = (key->sadb_key_bits + 7) / 8; + x->aalg = kmalloc(sizeof(*x->aalg) + keysize, GFP_KERNEL); + if (!x->aalg) + goto out; + strcpy(x->aalg->alg_name, a->name); + x->aalg->alg_key_len = 0; + if (key) { + x->aalg->alg_key_len = key->sadb_key_bits; + memcpy(x->aalg->alg_key, key+1, keysize); + } + x->props.aalgo = sa->sadb_sa_auth; + /* x->algo.flags = sa->sadb_sa_flags; */ + } + if (sa->sadb_sa_encrypt) { + if (hdr->sadb_msg_satype == SADB_X_SATYPE_IPCOMP) { + struct xfrm_algo_desc *a = xfrm_calg_get_byid(sa->sadb_sa_encrypt); + if (!a) + goto out; + x->calg = kmalloc(sizeof(*x->calg), GFP_KERNEL); + if (!x->calg) + goto out; + strcpy(x->calg->alg_name, a->name); + x->props.calgo = sa->sadb_sa_encrypt; + } else { + int keysize = 0; + struct xfrm_algo_desc *a = xfrm_ealg_get_byid(sa->sadb_sa_encrypt); + if (!a) + goto out; + key = (struct sadb_key*) ext_hdrs[SADB_EXT_KEY_ENCRYPT-1]; + if (key) + keysize = (key->sadb_key_bits + 7) / 8; + x->ealg = kmalloc(sizeof(*x->ealg) + keysize, GFP_KERNEL); + if (!x->ealg) + goto out; + strcpy(x->ealg->alg_name, a->name); + x->ealg->alg_key_len = 0; + 
if (key) { + x->ealg->alg_key_len = key->sadb_key_bits; + memcpy(x->ealg->alg_key, key+1, keysize); + } + x->props.ealgo = sa->sadb_sa_encrypt; + } + } + /* x->algo.flags = sa->sadb_sa_flags; */ + + x->props.family = pfkey_sadb_addr2xfrm_addr((struct sadb_address *) ext_hdrs[SADB_EXT_ADDRESS_SRC-1], + &x->props.saddr); + if (!x->props.family) + goto out; + pfkey_sadb_addr2xfrm_addr((struct sadb_address *) ext_hdrs[SADB_EXT_ADDRESS_DST-1], + &x->id.daddr); + + if (ext_hdrs[SADB_X_EXT_SA2-1]) { + struct sadb_x_sa2 *sa2 = (void*)ext_hdrs[SADB_X_EXT_SA2-1]; + x->props.mode = sa2->sadb_x_sa2_mode; + if (x->props.mode) + x->props.mode--; + x->props.reqid = sa2->sadb_x_sa2_reqid; + } + + if (ext_hdrs[SADB_EXT_ADDRESS_PROXY-1]) { + struct sadb_address *addr = ext_hdrs[SADB_EXT_ADDRESS_PROXY-1]; + + /* Nobody uses this, but we try. */ + pfkey_sadb_addr2xfrm_addr(addr, &x->sel.saddr); + x->sel.prefixlen_s = addr->sadb_address_prefixlen; + } + + if (ext_hdrs[SADB_X_EXT_NAT_T_TYPE-1]) { + struct sadb_x_nat_t_type* n_type; + struct xfrm_encap_tmpl *natt; + + x->encap = kmalloc(sizeof(*x->encap), GFP_KERNEL); + if (!x->encap) + goto out; + + natt = x->encap; + n_type = ext_hdrs[SADB_X_EXT_NAT_T_TYPE-1]; + natt->encap_type = n_type->sadb_x_nat_t_type_type; + + if (ext_hdrs[SADB_X_EXT_NAT_T_SPORT-1]) { + struct sadb_x_nat_t_port* n_port = + ext_hdrs[SADB_X_EXT_NAT_T_SPORT-1]; + natt->encap_sport = n_port->sadb_x_nat_t_port_port; + } + if (ext_hdrs[SADB_X_EXT_NAT_T_DPORT-1]) { + struct sadb_x_nat_t_port* n_port = + ext_hdrs[SADB_X_EXT_NAT_T_DPORT-1]; + natt->encap_dport = n_port->sadb_x_nat_t_port_port; + } + } + + x->type = xfrm_get_type(proto, x->props.family); + if (x->type == NULL) + goto out; + if (x->type->init_state(x, NULL)) + goto out; + x->km.seq = hdr->sadb_msg_seq; + x->km.state = XFRM_STATE_VALID; + return x; + +out: + x->km.state = XFRM_STATE_DEAD; + xfrm_state_put(x); + return ERR_PTR(-ENOBUFS); +} + +static int pfkey_reserved(struct sock *sk, struct sk_buff *skb, 
struct sadb_msg *hdr, void **ext_hdrs)
+{
+	return -EOPNOTSUPP;
+}
+
+/*
+ * SADB_GETSPI handler: find (or create, via xfrm_find_acq() with create=1)
+ * the acquire state for the given addresses, allocate an SPI for it from
+ * the requested range and send the resulting SA back to the requester.
+ *
+ * Returns 0 on success, -EINVAL for a malformed request, -ENOENT when no
+ * state or SPI could be obtained, or the error from building the reply.
+ */
+static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
+{
+	struct sk_buff *resp_skb;
+	struct sadb_x_sa2 *sa2;
+	struct sadb_address *saddr, *daddr;
+	struct sadb_msg *out_hdr;
+	struct xfrm_state *x = NULL;
+	u8 mode;
+	u32 reqid;
+	u8 proto;
+	unsigned short family;
+	xfrm_address_t *xsaddr = NULL, *xdaddr = NULL;
+
+	if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
+				     ext_hdrs[SADB_EXT_ADDRESS_DST-1]))
+		return -EINVAL;
+
+	proto = pfkey_satype2proto(hdr->sadb_msg_satype);
+	if (proto == 0)
+		return -EINVAL;
+
+	if ((sa2 = ext_hdrs[SADB_X_EXT_SA2-1]) != NULL) {
+		/*
+		 * The PF_KEY mode is the xfrm mode plus one.  Keep 0 as 0,
+		 * as pfkey_msg2xfrm_state() does, instead of letting the
+		 * u8 wrap around to 255.
+		 */
+		mode = sa2->sadb_x_sa2_mode;
+		if (mode)
+			mode--;
+		reqid = sa2->sadb_x_sa2_reqid;
+	} else {
+		mode = 0;
+		reqid = 0;
+	}
+
+	saddr = ext_hdrs[SADB_EXT_ADDRESS_SRC-1];
+	daddr = ext_hdrs[SADB_EXT_ADDRESS_DST-1];
+
+	family = ((struct sockaddr *)(saddr + 1))->sa_family;
+	switch (family) {
+	case AF_INET:
+		xdaddr = (xfrm_address_t *)&((struct sockaddr_in *)(daddr + 1))->sin_addr.s_addr;
+		xsaddr = (xfrm_address_t *)&((struct sockaddr_in *)(saddr + 1))->sin_addr.s_addr;
+		break;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	case AF_INET6:
+		xdaddr = (xfrm_address_t *)&((struct sockaddr_in6 *)(daddr + 1))->sin6_addr;
+		xsaddr = (xfrm_address_t *)&((struct sockaddr_in6 *)(saddr + 1))->sin6_addr;
+		break;
+#endif
+	}
+	/* Unsupported families leave xdaddr NULL and fall out as -ENOENT. */
+	if (xdaddr)
+		x = xfrm_find_acq(mode, reqid, proto, xdaddr, xsaddr, 1, family);
+
+	if (x == NULL)
+		return -ENOENT;
+
+	resp_skb = ERR_PTR(-ENOENT);
+
+	spin_lock_bh(&x->lock);
+	if (x->km.state != XFRM_STATE_DEAD) {
+		struct sadb_spirange *range = ext_hdrs[SADB_EXT_SPIRANGE-1];
+		u32 min_spi, max_spi;
+
+		if (range != NULL) {
+			min_spi = range->sadb_spirange_min;
+			max_spi = range->sadb_spirange_max;
+		} else {
+			min_spi = htonl(0x100);
+			max_spi = htonl(0x0fffffff);
+		}
+		xfrm_alloc_spi(x, min_spi, max_spi);
+		if (x->id.spi)
+			resp_skb = pfkey_xfrm_state2msg(x, 0, 3);
+	}
+	spin_unlock_bh(&x->lock);
+
+	if (IS_ERR(resp_skb)) {
+		xfrm_state_put(x);
+		return PTR_ERR(resp_skb);
+	}
+
+	out_hdr = (struct sadb_msg *) resp_skb->data;
+	out_hdr->sadb_msg_version = hdr->sadb_msg_version;
+	out_hdr->sadb_msg_type = SADB_GETSPI;
+	out_hdr->sadb_msg_satype = pfkey_proto2satype(proto);
+	out_hdr->sadb_msg_errno = 0;
+	out_hdr->sadb_msg_reserved = 0;
+	out_hdr->sadb_msg_seq = hdr->sadb_msg_seq;
+	out_hdr->sadb_msg_pid = hdr->sadb_msg_pid;
+
+	xfrm_state_put(x);
+
+	pfkey_broadcast(resp_skb, GFP_KERNEL, BROADCAST_ONE, sk);
+
+	return 0;
+}
+
+/*
+ * SADB_ACQUIRE handler.  Only a bare header carrying both a sequence number
+ * and an errno is acted upon: the matching state, if still in
+ * XFRM_STATE_ACQ, is moved to XFRM_STATE_ERROR and waiters on km_waitq are
+ * woken.  Messages with extensions return -EOPNOTSUPP; everything else is
+ * accepted and ignored.
+ */
+static int pfkey_acquire(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
+{
+	struct xfrm_state *x;
+
+	if (hdr->sadb_msg_len != sizeof(struct sadb_msg)/8)
+		return -EOPNOTSUPP;
+
+	if (hdr->sadb_msg_seq == 0 || hdr->sadb_msg_errno == 0)
+		return 0;
+
+	x = xfrm_find_acq_byseq(hdr->sadb_msg_seq);
+	if (x == NULL)
+		return 0;
+
+	spin_lock_bh(&x->lock);
+	if (x->km.state == XFRM_STATE_ACQ) {
+		x->km.state = XFRM_STATE_ERROR;
+		wake_up(&km_waitq);
+	}
+	spin_unlock_bh(&x->lock);
+	xfrm_state_put(x);
+	return 0;
+}
+
+
+/*
+ * SADB_ADD / SADB_UPDATE handler: build a state from the message, add or
+ * update it according to sadb_msg_type, and broadcast the resulting SA to
+ * all PF_KEY sockets.  Returns 0 or a negative errno.
+ */
+static int pfkey_add(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
+{
+	struct sk_buff *out_skb;
+	struct sadb_msg *out_hdr;
+	struct xfrm_state *x;
+	int err;
+
+	xfrm_probe_algs();
+
+	x = pfkey_msg2xfrm_state(hdr, ext_hdrs);
+	if (IS_ERR(x))
+		return PTR_ERR(x);
+
+	if (hdr->sadb_msg_type == SADB_ADD)
+		err = xfrm_state_add(x);
+	else
+		err = xfrm_state_update(x);
+
+	if (err < 0) {
+		x->km.state = XFRM_STATE_DEAD;
+		xfrm_state_put(x);
+		return err;
+	}
+
+	out_skb = pfkey_xfrm_state2msg(x, 0, 3);
+	if (IS_ERR(out_skb))
+		return PTR_ERR(out_skb); /* XXX Should we return 0 here ? */
+
+	out_hdr = (struct sadb_msg *) out_skb->data;
+	out_hdr->sadb_msg_version = hdr->sadb_msg_version;
+	out_hdr->sadb_msg_type = hdr->sadb_msg_type;
+	out_hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto);
+	out_hdr->sadb_msg_errno = 0;
+	out_hdr->sadb_msg_reserved = 0;
+	out_hdr->sadb_msg_seq = hdr->sadb_msg_seq;
+	out_hdr->sadb_msg_pid = hdr->sadb_msg_pid;
+
+	pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, sk);
+
+	return 0;
+}
+
+/*
+ * SADB_DELETE handler: look up the SA named by the message, refuse with
+ * -EPERM if xfrm_state_kern() reports it, otherwise delete it and echo the
+ * request to all PF_KEY sockets.  Returns -ESRCH if no such SA exists.
+ */
+static int pfkey_delete(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
+{
+	struct xfrm_state *x;
+
+	if (!ext_hdrs[SADB_EXT_SA-1] ||
+	    !present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
+				     ext_hdrs[SADB_EXT_ADDRESS_DST-1]))
+		return -EINVAL;
+
+	x = pfkey_xfrm_state_lookup(hdr, ext_hdrs);
+	if (x == NULL)
+		return -ESRCH;
+
+	if (xfrm_state_kern(x)) {
+		xfrm_state_put(x);
+		return -EPERM;
+	}
+
+	xfrm_state_delete(x);
+	xfrm_state_put(x);
+
+	pfkey_broadcast(skb_clone(skb, GFP_KERNEL), GFP_KERNEL,
+			BROADCAST_ALL, sk);
+
+	return 0;
+}
+
+/*
+ * SADB_GET handler: look up the SA named by the message and send its full
+ * description (including keys) back to the requesting socket only.
+ * Returns -ESRCH if no such SA exists.
+ */
+static int pfkey_get(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
+{
+	u8 proto;
+	struct sk_buff *out_skb;
+	struct sadb_msg *out_hdr;
+	struct xfrm_state *x;
+
+	if (!ext_hdrs[SADB_EXT_SA-1] ||
+	    !present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
+				     ext_hdrs[SADB_EXT_ADDRESS_DST-1]))
+		return -EINVAL;
+
+	x = pfkey_xfrm_state_lookup(hdr, ext_hdrs);
+	if (x == NULL)
+		return -ESRCH;
+
+	out_skb = pfkey_xfrm_state2msg(x, 1, 3);
+	/* Read what we still need before dropping our reference. */
+	proto = x->id.proto;
+	xfrm_state_put(x);
+	if (IS_ERR(out_skb))
+		return PTR_ERR(out_skb);
+
+	out_hdr = (struct sadb_msg *) out_skb->data;
+	out_hdr->sadb_msg_version = hdr->sadb_msg_version;
+	out_hdr->sadb_msg_type = SADB_DUMP;
+	out_hdr->sadb_msg_satype = pfkey_proto2satype(proto);
+	out_hdr->sadb_msg_errno = 0;
+	out_hdr->sadb_msg_reserved = 0;
+	out_hdr->sadb_msg_seq = hdr->sadb_msg_seq;
+	out_hdr->sadb_msg_pid = hdr->sadb_msg_pid;
+	pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, sk);
+
+	return 0;
+}
+
+/*
+ * Build the reply to SADB_REGISTER: a copy of @orig's header followed by a
+ * SADB_EXT_SUPPORTED_AUTH and a SADB_EXT_SUPPORTED_ENCRYPT extension listing
+ * every algorithm currently marked available.  An extension is omitted when
+ * its count is zero.
+ *
+ * @allocation: GFP flags for the skb.
+ * Returns the new skb, or NULL if allocation failed.
+ */
+static struct sk_buff *compose_sadb_supported(struct sadb_msg *orig, int allocation)
+{
+	struct sk_buff *skb;
+	struct sadb_msg *hdr;
+	int len, auth_len, enc_len, i;
+
+	auth_len = xfrm_count_auth_supported();
+	if (auth_len) {
+		auth_len *= sizeof(struct sadb_alg);
+		auth_len += sizeof(struct sadb_supported);
+	}
+
+	enc_len = xfrm_count_enc_supported();
+	if (enc_len) {
+		enc_len *= sizeof(struct sadb_alg);
+		enc_len += sizeof(struct sadb_supported);
+	}
+
+	len = enc_len + auth_len + sizeof(struct sadb_msg);
+
+	skb = alloc_skb(len + 16, allocation);
+	if (!skb)
+		goto out_put_algs;
+
+	hdr = (struct sadb_msg *) skb_put(skb, sizeof(*hdr));
+	pfkey_hdr_dup(hdr, orig);
+	hdr->sadb_msg_errno = 0;
+	hdr->sadb_msg_len = len / sizeof(uint64_t);
+
+	if (auth_len) {
+		struct sadb_supported *sp;
+		struct sadb_alg *ap;
+
+		sp = (struct sadb_supported *) skb_put(skb, auth_len);
+		ap = (struct sadb_alg *) (sp + 1);
+
+		sp->sadb_supported_len = auth_len / sizeof(uint64_t);
+		sp->sadb_supported_exttype = SADB_EXT_SUPPORTED_AUTH;
+
+		/* Walk the table until the by-index lookup runs out. */
+		for (i = 0; ; i++) {
+			struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(i);
+			if (!aalg)
+				break;
+			if (aalg->available)
+				*ap++ = aalg->desc;
+		}
+	}
+
+	if (enc_len) {
+		struct sadb_supported *sp;
+		struct sadb_alg *ap;
+
+		sp = (struct sadb_supported *) skb_put(skb, enc_len);
+		ap = (struct sadb_alg *) (sp + 1);
+
+		sp->sadb_supported_len = enc_len / sizeof(uint64_t);
+		sp->sadb_supported_exttype = SADB_EXT_SUPPORTED_ENCRYPT;
+
+		for (i = 0; ; i++) {
+			struct xfrm_algo_desc *ealg = xfrm_ealg_get_byidx(i);
+			if (!ealg)
+				break;
+			if (ealg->available)
+				*ap++ = ealg->desc;
+		}
+	}
+
+out_put_algs:
+	return skb;
+}
+
+/*
+ * SADB_REGISTER handler: record in the socket's "registered" bitmask that
+ * it listens for the given SA type (one bit per satype; SADB_SATYPE_UNSPEC
+ * sets no bit) and reply with the list of supported algorithms.
+ *
+ * Returns -EINVAL for an out-of-range satype, -EEXIST if the socket is
+ * already registered for it, -ENOBUFS if the reply cannot be allocated (the
+ * registration bit is rolled back in that case), 0 otherwise.
+ */
+static int pfkey_register(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
+{
+	struct pfkey_opt *pfk = pfkey_sk(sk);
+	struct sk_buff *supp_skb;
+
+	if (hdr->sadb_msg_satype > SADB_SATYPE_MAX)
+		return -EINVAL;
+
+	if (hdr->sadb_msg_satype != SADB_SATYPE_UNSPEC) {
+		if (pfk->registered&(1<<hdr->sadb_msg_satype))
+			return -EEXIST;
+		pfk->registered |= (1<<hdr->sadb_msg_satype);
+	}
+
+	xfrm_probe_algs();
+
+	supp_skb = compose_sadb_supported(hdr, GFP_KERNEL);
+	if (!supp_skb) {
+		if (hdr->sadb_msg_satype != SADB_SATYPE_UNSPEC)
+			pfk->registered &= ~(1<<hdr->sadb_msg_satype);
+
+		return -ENOBUFS;
+	}
+
+	pfkey_broadcast(supp_skb, GFP_KERNEL, BROADCAST_REGISTERED, sk);
+
+	return 0;
+}
+
+/*
+ * SADB_FLUSH handler: flush all states of the protocol selected by the
+ * message's satype and broadcast a bare header as notification.  The reply
+ * skb is allocated before flushing, so an allocation failure leaves the
+ * states untouched.
+ */
+static int pfkey_flush(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
+{
+	unsigned proto;
+	struct sk_buff *skb_out;
+	struct sadb_msg *hdr_out;
+
+	proto = pfkey_satype2proto(hdr->sadb_msg_satype);
+	if (proto == 0)
+		return -EINVAL;
+
+	skb_out = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_KERNEL);
+	if (!skb_out)
+		return -ENOBUFS;
+
+	xfrm_state_flush(proto);
+
+	hdr_out = (struct sadb_msg *) skb_put(skb_out, sizeof(struct sadb_msg));
+	pfkey_hdr_dup(hdr_out, hdr);
+	hdr_out->sadb_msg_errno = (uint8_t) 0;
+	hdr_out->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t));
+
+	pfkey_broadcast(skb_out, GFP_KERNEL, BROADCAST_ALL, NULL);
+
+	return 0;
+}
+
+/* Context handed to the dump_sa() / dump_sp() walker callbacks. */
+struct pfkey_dump_data
+{
+	struct sk_buff *skb;	/* the request being answered */
+	struct sadb_msg *hdr;	/* its base header */
+	struct sock *sk;	/* socket that receives the dump */
+};
+
+/*
+ * xfrm_state_walk() callback: send one SA to the dumping socket.  @count is
+ * the value supplied by the walker and is reported as the message sequence
+ * number.  Returns 0, or the error from building the message.
+ */
+static int dump_sa(struct xfrm_state *x, int count, void *ptr)
+{
+	struct pfkey_dump_data *data = ptr;
+	struct sk_buff *out_skb;
+	struct sadb_msg *out_hdr;
+
+	out_skb = pfkey_xfrm_state2msg(x, 1, 3);
+	if (IS_ERR(out_skb))
+		return PTR_ERR(out_skb);
+
+	out_hdr = (struct sadb_msg *) out_skb->data;
+	out_hdr->sadb_msg_version = data->hdr->sadb_msg_version;
+	out_hdr->sadb_msg_type = SADB_DUMP;
+	out_hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto);
+	out_hdr->sadb_msg_errno = 0;
+	out_hdr->sadb_msg_reserved = 0;
+	out_hdr->sadb_msg_seq = count;
+	out_hdr->sadb_msg_pid = data->hdr->sadb_msg_pid;
+	pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, data->sk);
+	return 0;
+}
+
+static int pfkey_dump(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
+{
+	u8 proto;
+	struct pfkey_dump_data data = { .skb = skb, .hdr = hdr, .sk = sk };
+
+	proto = pfkey_satype2proto(hdr->sadb_msg_satype);
+	if (proto == 0)
+		return -EINVAL;
+
+	/* dump_sa() sends one SADB_DUMP message per state to this socket. */
+	return xfrm_state_walk(proto, dump_sa, &data);
+}
+
+/*
+ * SADB_X_PROMISC handler.  A bare header switches promiscuous mode for this
+ * socket on (satype 1) or off (satype 0); any other satype is -EINVAL.  The
+ * message, bare or not, is then relayed to all PF_KEY sockets.
+ */
+static int pfkey_promisc(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
+{
+	struct pfkey_opt *pfk = pfkey_sk(sk);
+	int satype = hdr->sadb_msg_satype;
+
+	if (hdr->sadb_msg_len == (sizeof(*hdr) / sizeof(uint64_t))) {
+		/* XXX we mangle packet... */
+		hdr->sadb_msg_errno = 0;
+		if (satype != 0 && satype != 1)
+			return -EINVAL;
+		pfk->promisc = satype;
+	}
+	pfkey_broadcast(skb_clone(skb, GFP_KERNEL), GFP_KERNEL, BROADCAST_ALL, NULL);
+	return 0;
+}
+
+/*
+ * xfrm_policy_walk() callback used by gen_reqid(): returns -EEXIST when any
+ * template of @xp already uses the reqid pointed to by @ptr, 0 otherwise.
+ */
+static int check_reqid(struct xfrm_policy *xp, int dir, int count, void *ptr)
+{
+	int i;
+	u32 reqid = *(u32*)ptr;
+
+	for (i=0; i<xp->xfrm_nr; i++) {
+		if (xp->xfrm_vec[i].reqid == reqid)
+			return -EEXIST;
+	}
+	return 0;
+}
+
+/*
+ * Pick a reqid above IPSEC_MANUAL_REQID_MAX that no policy template uses
+ * yet.  The search resumes after the previously returned value and wraps
+ * around; 0 is returned when every candidate is taken.
+ */
+static u32 gen_reqid(void)
+{
+	u32 start;
+	static u32 reqid = IPSEC_MANUAL_REQID_MAX;
+
+	start = reqid;
+	do {
+		++reqid;
+		if (reqid == 0)
+			reqid = IPSEC_MANUAL_REQID_MAX+1;
+		if (xfrm_policy_walk(check_reqid, (void*)&reqid) != -EEXIST)
+			return reqid;
+	} while (reqid != start);
+	return 0;
+}
+
+/*
+ * Append one template to @xp from a single sadb_x_ipsecrequest.
+ *
+ * The caller must have checked that sadb_x_ipsecrequest_len bytes starting
+ * at @rq are readable.  In tunnel mode the request must additionally carry
+ * a source and a destination sockaddr of the policy's family directly
+ * behind the fixed part.
+ *
+ * Returns 0, -ELOOP when the policy already holds XFRM_MAX_DEPTH templates,
+ * -EINVAL for a malformed request, or -ENOBUFS when no unique reqid could
+ * be generated.
+ */
+static int
+parse_ipsecrequest(struct xfrm_policy *xp, struct sadb_x_ipsecrequest *rq)
+{
+	struct xfrm_tmpl *t = xp->xfrm_vec + xp->xfrm_nr;
+	struct sockaddr_in *sin;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	struct sockaddr_in6 *sin6;
+#endif
+
+	if (xp->xfrm_nr >= XFRM_MAX_DEPTH)
+		return -ELOOP;
+
+	if (rq->sadb_x_ipsecrequest_mode == 0)
+		return -EINVAL;
+
+	t->id.proto = rq->sadb_x_ipsecrequest_proto; /* XXX check proto */
+	t->mode = rq->sadb_x_ipsecrequest_mode-1;
+	if (rq->sadb_x_ipsecrequest_level == IPSEC_LEVEL_USE)
+		t->optional = 1;
+	else if (rq->sadb_x_ipsecrequest_level == IPSEC_LEVEL_UNIQUE) {
+		/* Reqids above the manual range are replaced by generated ones. */
+		t->reqid = rq->sadb_x_ipsecrequest_reqid;
+		if (t->reqid > IPSEC_MANUAL_REQID_MAX)
+			t->reqid = 0;
+		if (!t->reqid && !(t->reqid = gen_reqid()))
+			return -ENOBUFS;
+	}
+
+	/* addresses present only in tunnel mode */
+	if (t->mode) {
+		switch (xp->family) {
+		case AF_INET:
+			/* Both sockaddrs must lie inside this request. */
+			if (rq->sadb_x_ipsecrequest_len <
+			    sizeof(*rq) + 2*sizeof(struct sockaddr_in))
+				return -EINVAL;
+			sin = (void*)(rq+1);
+			if (sin->sin_family != AF_INET)
+				return -EINVAL;
+			t->saddr.a4 = sin->sin_addr.s_addr;
+			sin++;
+			if (sin->sin_family != AF_INET)
+				return -EINVAL;
+			t->id.daddr.a4 = sin->sin_addr.s_addr;
+			break;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+		case AF_INET6:
+			if (rq->sadb_x_ipsecrequest_len <
+			    sizeof(*rq) + 2*sizeof(struct sockaddr_in6))
+				return -EINVAL;
+			sin6 = (void *)(rq+1);
+			if (sin6->sin6_family != AF_INET6)
+				return -EINVAL;
+			memcpy(t->saddr.a6, &sin6->sin6_addr, sizeof(struct in6_addr));
+			sin6++;
+			if (sin6->sin6_family != AF_INET6)
+				return -EINVAL;
+			memcpy(t->id.daddr.a6, &sin6->sin6_addr, sizeof(struct in6_addr));
+			break;
+#endif
+		default:
+			return -EINVAL;
+		}
+	}
+	/* No way to set this via kame pfkey */
+	t->aalgos = t->ealgos = t->calgos = ~0;
+	xp->xfrm_nr++;
+	return 0;
+}
+
+/*
+ * Parse every sadb_x_ipsecrequest that follows the fixed part of the policy
+ * extension @pol and append the resulting templates to @xp.
+ *
+ * Each request's self-declared length is checked against the fixed header
+ * size and against the bytes left in the extension before it is used, so a
+ * bogus length cannot move the cursor outside the extension.  Trailing
+ * bytes too short to hold a request header are ignored.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int
+parse_ipsecrequests(struct xfrm_policy *xp, struct sadb_x_policy *pol)
+{
+	int err;
+	int len = pol->sadb_x_policy_len*8 - sizeof(struct sadb_x_policy);
+	struct sadb_x_ipsecrequest *rq = (void*)(pol+1);
+
+	/*
+	 * A negative length would turn into a huge unsigned value in the
+	 * comparison against sizeof() below.
+	 */
+	if (len < 0)
+		return -EINVAL;
+
+	while (len >= sizeof(struct sadb_x_ipsecrequest)) {
+		if (rq->sadb_x_ipsecrequest_len < sizeof(struct sadb_x_ipsecrequest) ||
+		    rq->sadb_x_ipsecrequest_len > len)
+			return -EINVAL;
+		if ((err = parse_ipsecrequest(xp, rq)) < 0)
+			return err;
+		len -= rq->sadb_x_ipsecrequest_len;
+		rq = (void*)((u8*)rq + rq->sadb_x_ipsecrequest_len);
+	}
+	return 0;
+}
+
+static int pfkey_xfrm_policy2msg_size(struct xfrm_policy *xp)
+{
+	int sockaddr_size = pfkey_sockaddr_size(xp->family);
+	int socklen = (xp->family == AF_INET ?
+		       sizeof(struct sockaddr_in) :
+		       sizeof(struct sockaddr_in6));
+
+	return sizeof(struct sadb_msg) +
+		(sizeof(struct sadb_lifetime) * 3) +
+		(sizeof(struct sadb_address) * 2) +
+		(sockaddr_size * 2) +
+		sizeof(struct sadb_x_policy) +
+		(xp->xfrm_nr * (sizeof(struct sadb_x_ipsecrequest) +
+				(socklen * 2)));
+}
+
+/*
+ * Allocate an skb large enough for the PF_KEY message describing @xp, as
+ * computed by pfkey_xfrm_policy2msg_size() plus 16 bytes of slack.
+ * Returns the skb or ERR_PTR(-ENOBUFS).
+ */
+static struct sk_buff * pfkey_xfrm_policy2msg_prep(struct xfrm_policy *xp)
+{
+	struct sk_buff *skb;
+	int size;
+
+	size = pfkey_xfrm_policy2msg_size(xp);
+
+	skb = alloc_skb(size + 16, GFP_ATOMIC);
+	if (skb == NULL)
+		return ERR_PTR(-ENOBUFS);
+
+	return skb;
+}
+
+/*
+ * Fill @skb with the PF_KEY description of policy @xp: base header, source
+ * and destination address, hard/soft/current lifetimes, and the policy
+ * extension followed by one sadb_x_ipsecrequest per template.
+ *
+ * @skb: buffer with room for pfkey_xfrm_policy2msg_size(xp) bytes
+ *       (presumably obtained from pfkey_xfrm_policy2msg_prep()).
+ * @dir: xfrm direction; reported to user space as dir+1.
+ *
+ * Only sadb_msg_len and sadb_msg_reserved (set to the policy's reference
+ * count) are filled in the base header; the caller sets the other fields.
+ * BUG()s on a family other than AF_INET / AF_INET6.
+ */
+static void pfkey_xfrm_policy2msg(struct sk_buff *skb, struct xfrm_policy *xp, int dir)
+{
+	struct sadb_msg *hdr;
+	struct sadb_address *addr;
+	struct sadb_lifetime *lifetime;
+	struct sadb_x_policy *pol;
+	struct sockaddr_in *sin;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	struct sockaddr_in6 *sin6;
+#endif
+	int i;
+	int size;
+	int sockaddr_size = pfkey_sockaddr_size(xp->family);
+	int socklen = (xp->family == AF_INET ?
+		       sizeof(struct sockaddr_in) :
+		       sizeof(struct sockaddr_in6));
+
+	size = pfkey_xfrm_policy2msg_size(xp);
+
+	/* call should fill header later */
+	hdr = (struct sadb_msg *) skb_put(skb, sizeof(struct sadb_msg));
+	/*
+	 * Clears the whole future message, not just the header put so far;
+	 * this relies on the skb having at least "size" bytes of room.
+	 */
+	memset(hdr, 0, size);	/* XXX do we need this ? */
+
+	/* src address */
+	addr = (struct sadb_address*) skb_put(skb,
+					      sizeof(struct sadb_address)+sockaddr_size);
+	addr->sadb_address_len =
+		(sizeof(struct sadb_address)+sockaddr_size)/
+			sizeof(uint64_t);
+	addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC;
+	addr->sadb_address_proto = pfkey_proto_from_xfrm(xp->selector.proto);
+	addr->sadb_address_prefixlen = xp->selector.prefixlen_s;
+	addr->sadb_address_reserved = 0;
+	/* src address */
+	if (xp->family == AF_INET) {
+		sin = (struct sockaddr_in *) (addr + 1);
+		sin->sin_family = AF_INET;
+		sin->sin_addr.s_addr = xp->selector.saddr.a4;
+		sin->sin_port = xp->selector.sport;
+		memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+	}
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	else if (xp->family == AF_INET6) {
+		sin6 = (struct sockaddr_in6 *) (addr + 1);
+		sin6->sin6_family = AF_INET6;
+		sin6->sin6_port = xp->selector.sport;
+		sin6->sin6_flowinfo = 0;
+		memcpy(&sin6->sin6_addr, xp->selector.saddr.a6,
+		       sizeof(struct in6_addr));
+		sin6->sin6_scope_id = 0;
+	}
+#endif
+	else
+		BUG();
+
+	/* dst address */
+	addr = (struct sadb_address*) skb_put(skb,
+					      sizeof(struct sadb_address)+sockaddr_size);
+	addr->sadb_address_len =
+		(sizeof(struct sadb_address)+sockaddr_size)/
+			sizeof(uint64_t);
+	addr->sadb_address_exttype = SADB_EXT_ADDRESS_DST;
+	addr->sadb_address_proto = pfkey_proto_from_xfrm(xp->selector.proto);
+	addr->sadb_address_prefixlen = xp->selector.prefixlen_d;
+	addr->sadb_address_reserved = 0;
+	if (xp->family == AF_INET) {
+		sin = (struct sockaddr_in *) (addr + 1);
+		sin->sin_family = AF_INET;
+		sin->sin_addr.s_addr = xp->selector.daddr.a4;
+		sin->sin_port = xp->selector.dport;
+		memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+	}
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	else if (xp->family == AF_INET6) {
+		sin6 = (struct sockaddr_in6 *) (addr + 1);
+		sin6->sin6_family = AF_INET6;
+		sin6->sin6_port = xp->selector.dport;
+		sin6->sin6_flowinfo = 0;
+		memcpy(&sin6->sin6_addr, xp->selector.daddr.a6,
+		       sizeof(struct in6_addr));
+		sin6->sin6_scope_id = 0;
+	}
+#endif
+	else
+		BUG();
+
+	/* hard time */
+	lifetime = (struct sadb_lifetime *) skb_put(skb,
+						    sizeof(struct sadb_lifetime));
+	lifetime->sadb_lifetime_len =
+		sizeof(struct sadb_lifetime)/sizeof(uint64_t);
+	lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_HARD;
+	lifetime->sadb_lifetime_allocations = _X2KEY(xp->lft.hard_packet_limit);
+	lifetime->sadb_lifetime_bytes = _X2KEY(xp->lft.hard_byte_limit);
+	lifetime->sadb_lifetime_addtime = xp->lft.hard_add_expires_seconds;
+	lifetime->sadb_lifetime_usetime = xp->lft.hard_use_expires_seconds;
+	/* soft time */
+	lifetime = (struct sadb_lifetime *) skb_put(skb,
+						    sizeof(struct sadb_lifetime));
+	lifetime->sadb_lifetime_len =
+		sizeof(struct sadb_lifetime)/sizeof(uint64_t);
+	lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_SOFT;
+	lifetime->sadb_lifetime_allocations = _X2KEY(xp->lft.soft_packet_limit);
+	lifetime->sadb_lifetime_bytes = _X2KEY(xp->lft.soft_byte_limit);
+	lifetime->sadb_lifetime_addtime = xp->lft.soft_add_expires_seconds;
+	lifetime->sadb_lifetime_usetime = xp->lft.soft_use_expires_seconds;
+	/* current time */
+	lifetime = (struct sadb_lifetime *) skb_put(skb,
+						    sizeof(struct sadb_lifetime));
+	lifetime->sadb_lifetime_len =
+		sizeof(struct sadb_lifetime)/sizeof(uint64_t);
+	lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_CURRENT;
+	lifetime->sadb_lifetime_allocations = xp->curlft.packets;
+	lifetime->sadb_lifetime_bytes = xp->curlft.bytes;
+	lifetime->sadb_lifetime_addtime = xp->curlft.add_time;
+	lifetime->sadb_lifetime_usetime = xp->curlft.use_time;
+
+	/* DISCARD unless the policy allows; then IPSEC if it has templates, else NONE. */
+	pol = (struct sadb_x_policy *) skb_put(skb, sizeof(struct sadb_x_policy));
+	pol->sadb_x_policy_len = sizeof(struct sadb_x_policy)/sizeof(uint64_t);
+	pol->sadb_x_policy_exttype = SADB_X_EXT_POLICY;
+	pol->sadb_x_policy_type = IPSEC_POLICY_DISCARD;
+	if (xp->action == XFRM_POLICY_ALLOW) {
+		if (xp->xfrm_nr)
+			pol->sadb_x_policy_type = IPSEC_POLICY_IPSEC;
+		else
+			pol->sadb_x_policy_type = IPSEC_POLICY_NONE;
+	}
+	pol->sadb_x_policy_dir = dir+1;
+	pol->sadb_x_policy_id = xp->index;
+
+	for (i=0; i<xp->xfrm_nr; i++) {
+		struct sadb_x_ipsecrequest *rq;
+		struct xfrm_tmpl *t = xp->xfrm_vec + i;
+		int req_size;
+
+		/*
+		 * The size estimate assumed two sockaddrs per template; only
+		 * tunnel-mode templates carry them, so give the bytes back
+		 * for the others.
+		 */
+		req_size = sizeof(struct sadb_x_ipsecrequest);
+		if (t->mode)
+			req_size += 2*socklen;
+		else
+			size -= 2*socklen;
+		rq = (void*)skb_put(skb, req_size);
+		pol->sadb_x_policy_len += req_size/8;
+		memset(rq, 0, sizeof(*rq));
+		rq->sadb_x_ipsecrequest_len = req_size;
+		rq->sadb_x_ipsecrequest_proto = t->id.proto;
+		rq->sadb_x_ipsecrequest_mode = t->mode+1;
+		/* REQUIRE by default; a reqid makes it UNIQUE; optional wins as USE. */
+		rq->sadb_x_ipsecrequest_level = IPSEC_LEVEL_REQUIRE;
+		if (t->reqid)
+			rq->sadb_x_ipsecrequest_level = IPSEC_LEVEL_UNIQUE;
+		if (t->optional)
+			rq->sadb_x_ipsecrequest_level = IPSEC_LEVEL_USE;
+		rq->sadb_x_ipsecrequest_reqid = t->reqid;
+		if (t->mode) {
+			switch (xp->family) {
+			case AF_INET:
+				sin = (void*)(rq+1);
+				sin->sin_family = AF_INET;
+				sin->sin_addr.s_addr = t->saddr.a4;
+				sin->sin_port = 0;
+				memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+				sin++;
+				sin->sin_family = AF_INET;
+				sin->sin_addr.s_addr = t->id.daddr.a4;
+				sin->sin_port = 0;
+				memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+				break;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+			case AF_INET6:
+				sin6 = (void*)(rq+1);
+				sin6->sin6_family = AF_INET6;
+				sin6->sin6_port = 0;
+				sin6->sin6_flowinfo = 0;
+				memcpy(&sin6->sin6_addr, t->saddr.a6,
+				       sizeof(struct in6_addr));
+				sin6->sin6_scope_id = 0;
+
+				sin6++;
+				sin6->sin6_family = AF_INET6;
+				sin6->sin6_port = 0;
+				sin6->sin6_flowinfo = 0;
+				memcpy(&sin6->sin6_addr, t->id.daddr.a6,
+				       sizeof(struct in6_addr));
+				sin6->sin6_scope_id = 0;
+				break;
+#endif
+			default:
+				break;
+			}
+		}
+	}
+	hdr->sadb_msg_len = size / sizeof(uint64_t);
+	hdr->sadb_msg_reserved = atomic_read(&xp->refcnt);
+}
+
+static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
+{
+	int err;
+	struct sadb_lifetime *lifetime;
+	struct sadb_address *sa;
+	struct sadb_x_policy *pol;
+	struct xfrm_policy *xp;
+	struct sk_buff *out_skb;
+	struct sadb_msg *out_hdr;
+
+	if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
+				     ext_hdrs[SADB_EXT_ADDRESS_DST-1]) ||
+	    !ext_hdrs[SADB_X_EXT_POLICY-1])
+		return -EINVAL;
+
+	pol = ext_hdrs[SADB_X_EXT_POLICY-1];
+	if (pol->sadb_x_policy_type > IPSEC_POLICY_IPSEC)
+		return -EINVAL;
+	if (!pol->sadb_x_policy_dir || pol->sadb_x_policy_dir >= IPSEC_DIR_MAX)
+		return -EINVAL;
+
+	xp = xfrm_policy_alloc(GFP_KERNEL);
+	if (xp == NULL)
+		return -ENOBUFS;
+
+	xp->action = (pol->sadb_x_policy_type == IPSEC_POLICY_DISCARD ?
+		      XFRM_POLICY_BLOCK : XFRM_POLICY_ALLOW);
+
+	/* Source half of the selector; it also fixes the policy's family. */
+	sa = ext_hdrs[SADB_EXT_ADDRESS_SRC-1];
+	xp->family = pfkey_sadb_addr2xfrm_addr(sa, &xp->selector.saddr);
+	if (!xp->family) {
+		err = -EINVAL;
+		goto out;
+	}
+	xp->selector.family = xp->family;
+	xp->selector.prefixlen_s = sa->sadb_address_prefixlen;
+	xp->selector.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto);
+	xp->selector.sport = ((struct sockaddr_in *)(sa+1))->sin_port;
+	if (xp->selector.sport)
+		xp->selector.sport_mask = ~0;
+
+	/* Destination half of the selector. */
+	sa = ext_hdrs[SADB_EXT_ADDRESS_DST-1];
+	pfkey_sadb_addr2xfrm_addr(sa, &xp->selector.daddr);
+	xp->selector.prefixlen_d = sa->sadb_address_prefixlen;
+
+	/* Amusing, we set this twice.  KAME apps appear to set same value
+	 * in both addresses.
+	 */
+	xp->selector.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto);
+
+	xp->selector.dport = ((struct sockaddr_in *)(sa+1))->sin_port;
+	if (xp->selector.dport)
+		xp->selector.dport_mask = ~0;
+
+	/* Limits default to "infinite" unless lifetime extensions say otherwise. */
+	xp->lft.soft_byte_limit = XFRM_INF;
+	xp->lft.hard_byte_limit = XFRM_INF;
+	xp->lft.soft_packet_limit = XFRM_INF;
+	xp->lft.hard_packet_limit = XFRM_INF;
+	if ((lifetime = ext_hdrs[SADB_EXT_LIFETIME_HARD-1]) != NULL) {
+		xp->lft.hard_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations);
+		xp->lft.hard_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes);
+		xp->lft.hard_add_expires_seconds = lifetime->sadb_lifetime_addtime;
+		xp->lft.hard_use_expires_seconds = lifetime->sadb_lifetime_usetime;
+	}
+	if ((lifetime = ext_hdrs[SADB_EXT_LIFETIME_SOFT-1]) != NULL) {
+		xp->lft.soft_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations);
+		xp->lft.soft_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes);
+		xp->lft.soft_add_expires_seconds = lifetime->sadb_lifetime_addtime;
+		xp->lft.soft_use_expires_seconds = lifetime->sadb_lifetime_usetime;
+	}
+	xp->xfrm_nr = 0;
+	if (pol->sadb_x_policy_type == IPSEC_POLICY_IPSEC &&
+	    (err = parse_ipsecrequests(xp, pol)) < 0)
+		goto out;
+
+	/* Allocate the reply first so that a successful insert can always be reported. */
+	out_skb = pfkey_xfrm_policy2msg_prep(xp);
+	if (IS_ERR(out_skb)) {
+		err = PTR_ERR(out_skb);
+		goto out;
+	}
+
+	err = xfrm_policy_insert(pol->sadb_x_policy_dir-1, xp,
+				 hdr->sadb_msg_type != SADB_X_SPDUPDATE);
+	if (err) {
+		kfree_skb(out_skb);
+		goto out;
+	}
+
+	pfkey_xfrm_policy2msg(out_skb, xp, pol->sadb_x_policy_dir-1);
+
+	xfrm_pol_put(xp);
+
+	out_hdr = (struct sadb_msg *) out_skb->data;
+	out_hdr->sadb_msg_version = hdr->sadb_msg_version;
+	out_hdr->sadb_msg_type = hdr->sadb_msg_type;
+	out_hdr->sadb_msg_satype = 0;
+	out_hdr->sadb_msg_errno = 0;
+	out_hdr->sadb_msg_seq = hdr->sadb_msg_seq;
+	out_hdr->sadb_msg_pid = hdr->sadb_msg_pid;
+	pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, sk);
+	return 0;
+
+out:
+	/* The policy was never inserted, so it is freed directly. */
+	kfree(xp);
+	return err;
+}
+
+/*
+ * SADB_X_SPDDELETE handler: rebuild the selector from the address
+ * extensions, remove the matching policy of the given direction and
+ * broadcast its description.  Returns -ENOENT when no policy matches.
+ */
+static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
+{
+	int err;
+	struct sadb_address *sa;
+	struct sadb_x_policy *pol;
+	struct xfrm_policy *xp;
+	struct sk_buff *out_skb;
+	struct sadb_msg *out_hdr;
+	struct xfrm_selector sel;
+
+	if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
+				     ext_hdrs[SADB_EXT_ADDRESS_DST-1]) ||
+	    !ext_hdrs[SADB_X_EXT_POLICY-1])
+		return -EINVAL;
+
+	pol = ext_hdrs[SADB_X_EXT_POLICY-1];
+	if (!pol->sadb_x_policy_dir || pol->sadb_x_policy_dir >= IPSEC_DIR_MAX)
+		return -EINVAL;
+
+	memset(&sel, 0, sizeof(sel));
+
+	/*
+	 * pfkey_spdadd() records the address family in the selector it
+	 * stores, so the lookup key has to carry it as well.
+	 */
+	sa = ext_hdrs[SADB_EXT_ADDRESS_SRC-1];
+	sel.family = pfkey_sadb_addr2xfrm_addr(sa, &sel.saddr);
+	sel.prefixlen_s = sa->sadb_address_prefixlen;
+	sel.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto);
+	sel.sport = ((struct sockaddr_in *)(sa+1))->sin_port;
+	if (sel.sport)
+		sel.sport_mask = ~0;
+
+	sa = ext_hdrs[SADB_EXT_ADDRESS_DST-1];
+	pfkey_sadb_addr2xfrm_addr(sa, &sel.daddr);
+	sel.prefixlen_d = sa->sadb_address_prefixlen;
+	sel.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto);
+	sel.dport = ((struct sockaddr_in *)(sa+1))->sin_port;
+	if (sel.dport)
+		sel.dport_mask = ~0;
+
+	xp = xfrm_policy_bysel(pol->sadb_x_policy_dir-1, &sel, 1);
+	if (xp == NULL)
+		return -ENOENT;
+
+	err = 0;
+
+	out_skb = pfkey_xfrm_policy2msg_prep(xp);
+	if (IS_ERR(out_skb)) {
+		err = PTR_ERR(out_skb);
+		goto out;
+	}
+	pfkey_xfrm_policy2msg(out_skb, xp, pol->sadb_x_policy_dir-1);
+
+	out_hdr = (struct sadb_msg *) out_skb->data;
+	out_hdr->sadb_msg_version = hdr->sadb_msg_version;
+	out_hdr->sadb_msg_type = SADB_X_SPDDELETE;
+	out_hdr->sadb_msg_satype = 0;
+	out_hdr->sadb_msg_errno = 0;
+	out_hdr->sadb_msg_seq = hdr->sadb_msg_seq;
+	out_hdr->sadb_msg_pid = hdr->sadb_msg_pid;
+	pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, sk);
+	err = 0;
+
+out:
+	xfrm_pol_put(xp);
+	return err;
+}
+
+/*
+ * SADB_X_SPDGET / SADB_X_SPDDELETE2 handler: look a policy up by the id in
+ * the policy extension (the lookup also deletes it when the message type is
+ * SADB_X_SPDDELETE2) and broadcast its description.
+ *
+ * NOTE(review): the direction reported in the reply comes from the
+ * request's sadb_x_policy_dir, which is not validated here - confirm
+ * callers always set it.
+ */
+static int pfkey_spdget(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
+{
+	int err;
+	struct sadb_x_policy *pol;
+	struct xfrm_policy *xp;
+	struct sk_buff *out_skb;
+	struct sadb_msg *out_hdr;
+
+	if ((pol = ext_hdrs[SADB_X_EXT_POLICY-1]) == NULL)
+		return -EINVAL;
+
+	xp = xfrm_policy_byid(0, pol->sadb_x_policy_id,
+			      hdr->sadb_msg_type == SADB_X_SPDDELETE2);
+	if (xp == NULL)
+		return -ENOENT;
+
+	err = 0;
+
+	out_skb = pfkey_xfrm_policy2msg_prep(xp);
+	if (IS_ERR(out_skb)) {
+		err = PTR_ERR(out_skb);
+		goto out;
+	}
+	pfkey_xfrm_policy2msg(out_skb, xp, pol->sadb_x_policy_dir-1);
+
+	out_hdr = (struct sadb_msg *) out_skb->data;
+	out_hdr->sadb_msg_version = hdr->sadb_msg_version;
+	out_hdr->sadb_msg_type = hdr->sadb_msg_type;
+	out_hdr->sadb_msg_satype = 0;
+	out_hdr->sadb_msg_errno = 0;
+	out_hdr->sadb_msg_seq = hdr->sadb_msg_seq;
+	out_hdr->sadb_msg_pid = hdr->sadb_msg_pid;
+	pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, sk);
+	err = 0;
+
+out:
+	xfrm_pol_put(xp);
+	return err;
+}
+
+/*
+ * xfrm_policy_walk() callback: send one policy to the dumping socket.
+ * @count is the value supplied by the walker and is reported as the
+ * message sequence number.
+ */
+static int dump_sp(struct xfrm_policy *xp, int dir, int count, void *ptr)
+{
+	struct pfkey_dump_data *data = ptr;
+	struct sk_buff *out_skb;
+	struct sadb_msg *out_hdr;
+
+	out_skb = pfkey_xfrm_policy2msg_prep(xp);
+	if (IS_ERR(out_skb))
+		return PTR_ERR(out_skb);
+
+	pfkey_xfrm_policy2msg(out_skb, xp, dir);
+
+	out_hdr = (struct sadb_msg *) out_skb->data;
+	out_hdr->sadb_msg_version = data->hdr->sadb_msg_version;
+	out_hdr->sadb_msg_type = SADB_X_SPDDUMP;
+	out_hdr->sadb_msg_satype = SADB_SATYPE_UNSPEC;
+	out_hdr->sadb_msg_errno = 0;
+	out_hdr->sadb_msg_seq = count;
+	out_hdr->sadb_msg_pid = data->hdr->sadb_msg_pid;
+	pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, data->sk);
+	return 0;
+}
+
+/* SADB_X_SPDDUMP handler: send every policy to the requesting socket. */
+static int pfkey_spddump(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
+{
+	struct pfkey_dump_data data = { .skb = skb, .hdr = hdr, .sk = sk };
+
+	return xfrm_policy_walk(dump_sp, &data);
+}
+
+/*
+ * SADB_X_SPDFLUSH handler: flush all policies and broadcast a bare header
+ * as notification.  The reply is allocated before flushing, so an
+ * allocation failure leaves the policies untouched.
+ */
+static int pfkey_spdflush(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
+{
+	struct sk_buff *skb_out;
+	struct sadb_msg *hdr_out;
+
+	skb_out = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_KERNEL);
+	if (!skb_out)
+		return -ENOBUFS;
+
+	xfrm_policy_flush();
+
+	hdr_out = (struct sadb_msg *) skb_put(skb_out, sizeof(struct sadb_msg));
+	pfkey_hdr_dup(hdr_out, hdr);
+	hdr_out->sadb_msg_errno = (uint8_t) 0;
+	hdr_out->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t));
+	pfkey_broadcast(skb_out, GFP_KERNEL, BROADCAST_ALL, NULL);
+
+	return 0;
+}
+
+/*
+ * Message dispatch table, indexed by sadb_msg_type.  A NULL entry makes
+ * pfkey_process() answer -EOPNOTSUPP.
+ */
+typedef int (*pfkey_handler)(struct sock *sk, struct sk_buff *skb,
+			     struct sadb_msg *hdr, void **ext_hdrs);
+static pfkey_handler pfkey_funcs[SADB_MAX + 1] = {
+	[SADB_RESERVED]		= pfkey_reserved,
+	[SADB_GETSPI]		= pfkey_getspi,
+	[SADB_UPDATE]		= pfkey_add,
+	[SADB_ADD]		= pfkey_add,
+	[SADB_DELETE]		= pfkey_delete,
+	[SADB_GET]		= pfkey_get,
+	[SADB_ACQUIRE]		= pfkey_acquire,
+	[SADB_REGISTER]		= pfkey_register,
+	[SADB_EXPIRE]		= NULL,
+	[SADB_FLUSH]		= pfkey_flush,
+	[SADB_DUMP]		= pfkey_dump,
+	[SADB_X_PROMISC]	= pfkey_promisc,
+	[SADB_X_PCHANGE]	= NULL,
+	[SADB_X_SPDUPDATE]	= pfkey_spdadd,
+	[SADB_X_SPDADD]		= pfkey_spdadd,
+	[SADB_X_SPDDELETE]	= pfkey_spddelete,
+	[SADB_X_SPDGET]		= pfkey_spdget,
+	[SADB_X_SPDACQUIRE]	= NULL,
+	[SADB_X_SPDDUMP]	= pfkey_spddump,
+	[SADB_X_SPDFLUSH]	= pfkey_spdflush,
+	[SADB_X_SPDSETIDX]	= pfkey_spdadd,
+	[SADB_X_SPDDELETE2]	= pfkey_spdget,
+};
+
+/*
+ * Process one PF_KEY message: relay a copy to promiscuous listeners, parse
+ * the extensions and call the handler registered for the message type.
+ * Returns the handler's result, the parse error, or -EOPNOTSUPP when the
+ * type has no handler.
+ *
+ * NOTE(review): indexes pfkey_funcs[] with sadb_msg_type without a range
+ * check of its own - presumably the caller has validated the type.
+ */
+static int pfkey_process(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr)
+{
+	void *ext_hdrs[SADB_EXT_MAX];
+	int err;
+
+	pfkey_broadcast(skb_clone(skb, GFP_KERNEL), GFP_KERNEL,
+			BROADCAST_PROMISC_ONLY, NULL);
+
+	memset(ext_hdrs, 0, sizeof(ext_hdrs));
+	err = parse_exthdrs(skb, hdr, ext_hdrs);
+	if (!err) {
+		err = -EOPNOTSUPP;
+		if (pfkey_funcs[hdr->sadb_msg_type])
+			err = pfkey_funcs[hdr->sadb_msg_type](sk, skb, hdr, ext_hdrs);
+	}
+	return err;
+}
+
+static struct sadb_msg *pfkey_get_base_msg(struct sk_buff *skb, int *errp)
+{
+	struct sadb_msg *hdr = NULL;
+
+	if (skb->len < sizeof(*hdr)) {
+		*errp = -EMSGSIZE;
+	} else {
+		hdr = (struct sadb_msg *)
skb->data; + if (hdr->sadb_msg_version != PF_KEY_V2 || + hdr->sadb_msg_reserved != 0 || + (hdr->sadb_msg_type <= SADB_RESERVED || + hdr->sadb_msg_type > SADB_MAX)) { + hdr = NULL; + *errp = -EINVAL; + } else if (hdr->sadb_msg_len != (skb->len / + sizeof(uint64_t)) || + hdr->sadb_msg_len < (sizeof(struct sadb_msg) / + sizeof(uint64_t))) { + hdr = NULL; + *errp = -EMSGSIZE; + } else { + *errp = 0; + } + } + return hdr; +} + +static inline int aalg_tmpl_set(struct xfrm_tmpl *t, struct xfrm_algo_desc *d) +{ + return t->aalgos & (1 << d->desc.sadb_alg_id); +} + +static inline int ealg_tmpl_set(struct xfrm_tmpl *t, struct xfrm_algo_desc *d) +{ + return t->ealgos & (1 << d->desc.sadb_alg_id); +} + +static int count_ah_combs(struct xfrm_tmpl *t) +{ + int i, sz = 0; + + for (i = 0; ; i++) { + struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(i); + if (!aalg) + break; + if (aalg_tmpl_set(t, aalg) && aalg->available) + sz += sizeof(struct sadb_comb); + } + return sz + sizeof(struct sadb_prop); +} + +static int count_esp_combs(struct xfrm_tmpl *t) +{ + int i, k, sz = 0; + + for (i = 0; ; i++) { + struct xfrm_algo_desc *ealg = xfrm_ealg_get_byidx(i); + if (!ealg) + break; + + if (!(ealg_tmpl_set(t, ealg) && ealg->available)) + continue; + + for (k = 1; ; k++) { + struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(k); + if (!aalg) + break; + + if (aalg_tmpl_set(t, aalg) && aalg->available) + sz += sizeof(struct sadb_comb); + } + } + return sz + sizeof(struct sadb_prop); +} + +static void dump_ah_combs(struct sk_buff *skb, struct xfrm_tmpl *t) +{ + struct sadb_prop *p; + int i; + + p = (struct sadb_prop*)skb_put(skb, sizeof(struct sadb_prop)); + p->sadb_prop_len = sizeof(struct sadb_prop)/8; + p->sadb_prop_exttype = SADB_EXT_PROPOSAL; + p->sadb_prop_replay = 32; + memset(p->sadb_prop_reserved, 0, sizeof(p->sadb_prop_reserved)); + + for (i = 0; ; i++) { + struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(i); + if (!aalg) + break; + + if (aalg_tmpl_set(t, aalg) && aalg->available) { 
+ struct sadb_comb *c; + c = (struct sadb_comb*)skb_put(skb, sizeof(struct sadb_comb)); + memset(c, 0, sizeof(*c)); + p->sadb_prop_len += sizeof(struct sadb_comb)/8; + c->sadb_comb_auth = aalg->desc.sadb_alg_id; + c->sadb_comb_auth_minbits = aalg->desc.sadb_alg_minbits; + c->sadb_comb_auth_maxbits = aalg->desc.sadb_alg_maxbits; + c->sadb_comb_hard_addtime = 24*60*60; + c->sadb_comb_soft_addtime = 20*60*60; + c->sadb_comb_hard_usetime = 8*60*60; + c->sadb_comb_soft_usetime = 7*60*60; + } + } +} + +static void dump_esp_combs(struct sk_buff *skb, struct xfrm_tmpl *t) +{ + struct sadb_prop *p; + int i, k; + + p = (struct sadb_prop*)skb_put(skb, sizeof(struct sadb_prop)); + p->sadb_prop_len = sizeof(struct sadb_prop)/8; + p->sadb_prop_exttype = SADB_EXT_PROPOSAL; + p->sadb_prop_replay = 32; + memset(p->sadb_prop_reserved, 0, sizeof(p->sadb_prop_reserved)); + + for (i=0; ; i++) { + struct xfrm_algo_desc *ealg = xfrm_ealg_get_byidx(i); + if (!ealg) + break; + + if (!(ealg_tmpl_set(t, ealg) && ealg->available)) + continue; + + for (k = 1; ; k++) { + struct sadb_comb *c; + struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(k); + if (!aalg) + break; + if (!(aalg_tmpl_set(t, aalg) && aalg->available)) + continue; + c = (struct sadb_comb*)skb_put(skb, sizeof(struct sadb_comb)); + memset(c, 0, sizeof(*c)); + p->sadb_prop_len += sizeof(struct sadb_comb)/8; + c->sadb_comb_auth = aalg->desc.sadb_alg_id; + c->sadb_comb_auth_minbits = aalg->desc.sadb_alg_minbits; + c->sadb_comb_auth_maxbits = aalg->desc.sadb_alg_maxbits; + c->sadb_comb_encrypt = ealg->desc.sadb_alg_id; + c->sadb_comb_encrypt_minbits = ealg->desc.sadb_alg_minbits; + c->sadb_comb_encrypt_maxbits = ealg->desc.sadb_alg_maxbits; + c->sadb_comb_hard_addtime = 24*60*60; + c->sadb_comb_soft_addtime = 20*60*60; + c->sadb_comb_hard_usetime = 8*60*60; + c->sadb_comb_soft_usetime = 7*60*60; + } + } +} + +static int pfkey_send_notify(struct xfrm_state *x, int hard) +{ + struct sk_buff *out_skb; + struct sadb_msg *out_hdr; + int 
hsc = (hard ? 2 : 1); + + out_skb = pfkey_xfrm_state2msg(x, 0, hsc); + if (IS_ERR(out_skb)) + return PTR_ERR(out_skb); + + out_hdr = (struct sadb_msg *) out_skb->data; + out_hdr->sadb_msg_version = PF_KEY_V2; + out_hdr->sadb_msg_type = SADB_EXPIRE; + out_hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto); + out_hdr->sadb_msg_errno = 0; + out_hdr->sadb_msg_reserved = 0; + out_hdr->sadb_msg_seq = 0; + out_hdr->sadb_msg_pid = 0; + + pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL); + return 0; +} + +static u32 get_acqseq(void) +{ + u32 res; + static u32 acqseq; + static spinlock_t acqseq_lock = SPIN_LOCK_UNLOCKED; + + spin_lock_bh(&acqseq_lock); + res = (++acqseq ? : ++acqseq); + spin_unlock_bh(&acqseq_lock); + return res; +} + +static int pfkey_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *xp, int dir) +{ + struct sk_buff *skb; + struct sadb_msg *hdr; + struct sadb_address *addr; + struct sadb_x_policy *pol; + struct sockaddr_in *sin; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + struct sockaddr_in6 *sin6; +#endif + int sockaddr_size; + int size; + + sockaddr_size = pfkey_sockaddr_size(x->props.family); + if (!sockaddr_size) + return -EINVAL; + + size = sizeof(struct sadb_msg) + + (sizeof(struct sadb_address) * 2) + + (sockaddr_size * 2) + + sizeof(struct sadb_x_policy); + + if (x->id.proto == IPPROTO_AH) + size += count_ah_combs(t); + else if (x->id.proto == IPPROTO_ESP) + size += count_esp_combs(t); + + skb = alloc_skb(size + 16, GFP_ATOMIC); + if (skb == NULL) + return -ENOMEM; + + hdr = (struct sadb_msg *) skb_put(skb, sizeof(struct sadb_msg)); + hdr->sadb_msg_version = PF_KEY_V2; + hdr->sadb_msg_type = SADB_ACQUIRE; + hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto); + hdr->sadb_msg_len = size / sizeof(uint64_t); + hdr->sadb_msg_errno = 0; + hdr->sadb_msg_reserved = 0; + hdr->sadb_msg_seq = x->km.seq = get_acqseq(); + hdr->sadb_msg_pid = 0; + + /* src address */ + addr = (struct sadb_address*) 
skb_put(skb, + sizeof(struct sadb_address)+sockaddr_size); + addr->sadb_address_len = + (sizeof(struct sadb_address)+sockaddr_size)/ + sizeof(uint64_t); + addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC; + addr->sadb_address_proto = 0; + addr->sadb_address_reserved = 0; + if (x->props.family == AF_INET) { + addr->sadb_address_prefixlen = 32; + + sin = (struct sockaddr_in *) (addr + 1); + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = x->props.saddr.a4; + sin->sin_port = 0; + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + else if (x->props.family == AF_INET6) { + addr->sadb_address_prefixlen = 128; + + sin6 = (struct sockaddr_in6 *) (addr + 1); + sin6->sin6_family = AF_INET6; + sin6->sin6_port = 0; + sin6->sin6_flowinfo = 0; + memcpy(&sin6->sin6_addr, + x->props.saddr.a6, sizeof(struct in6_addr)); + sin6->sin6_scope_id = 0; + } +#endif + else + BUG(); + + /* dst address */ + addr = (struct sadb_address*) skb_put(skb, + sizeof(struct sadb_address)+sockaddr_size); + addr->sadb_address_len = + (sizeof(struct sadb_address)+sockaddr_size)/ + sizeof(uint64_t); + addr->sadb_address_exttype = SADB_EXT_ADDRESS_DST; + addr->sadb_address_proto = 0; + addr->sadb_address_reserved = 0; + if (x->props.family == AF_INET) { + addr->sadb_address_prefixlen = 32; + + sin = (struct sockaddr_in *) (addr + 1); + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = x->id.daddr.a4; + sin->sin_port = 0; + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + else if (x->props.family == AF_INET6) { + addr->sadb_address_prefixlen = 128; + + sin6 = (struct sockaddr_in6 *) (addr + 1); + sin6->sin6_family = AF_INET6; + sin6->sin6_port = 0; + sin6->sin6_flowinfo = 0; + memcpy(&sin6->sin6_addr, + x->id.daddr.a6, sizeof(struct in6_addr)); + sin6->sin6_scope_id = 0; + } +#endif + else + BUG(); + + pol = (struct sadb_x_policy *) skb_put(skb, sizeof(struct sadb_x_policy)); + 
pol->sadb_x_policy_len = sizeof(struct sadb_x_policy)/sizeof(uint64_t); + pol->sadb_x_policy_exttype = SADB_X_EXT_POLICY; + pol->sadb_x_policy_type = IPSEC_POLICY_IPSEC; + pol->sadb_x_policy_dir = dir+1; + pol->sadb_x_policy_id = xp->index; + + /* Set sadb_comb's. */ + if (x->id.proto == IPPROTO_AH) + dump_ah_combs(skb, t); + else if (x->id.proto == IPPROTO_ESP) + dump_esp_combs(skb, t); + + return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL); +} + +static struct xfrm_policy *pfkey_compile_policy(u16 family, int opt, + u8 *data, int len, int *dir) +{ + struct xfrm_policy *xp; + struct sadb_x_policy *pol = (struct sadb_x_policy*)data; + + switch (family) { + case AF_INET: + if (opt != IP_IPSEC_POLICY) { + *dir = -EOPNOTSUPP; + return NULL; + } + break; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case AF_INET6: + if (opt != IPV6_IPSEC_POLICY) { + *dir = -EOPNOTSUPP; + return NULL; + } + break; +#endif + default: + *dir = -EINVAL; + return NULL; + } + + *dir = -EINVAL; + + if (len < sizeof(struct sadb_x_policy) || + pol->sadb_x_policy_len*8 > len || + pol->sadb_x_policy_type > IPSEC_POLICY_BYPASS || + (!pol->sadb_x_policy_dir || pol->sadb_x_policy_dir > IPSEC_DIR_OUTBOUND)) + return NULL; + + xp = xfrm_policy_alloc(GFP_ATOMIC); + if (xp == NULL) { + *dir = -ENOBUFS; + return NULL; + } + + xp->action = (pol->sadb_x_policy_type == IPSEC_POLICY_DISCARD ? 
+ XFRM_POLICY_BLOCK : XFRM_POLICY_ALLOW); + + xp->lft.soft_byte_limit = XFRM_INF; + xp->lft.hard_byte_limit = XFRM_INF; + xp->lft.soft_packet_limit = XFRM_INF; + xp->lft.hard_packet_limit = XFRM_INF; + xp->family = family; + + xp->xfrm_nr = 0; + if (pol->sadb_x_policy_type == IPSEC_POLICY_IPSEC && + (*dir = parse_ipsecrequests(xp, pol)) < 0) + goto out; + + *dir = pol->sadb_x_policy_dir-1; + return xp; + +out: + kfree(xp); + return NULL; +} + +static int pfkey_send_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, u16 sport) +{ + struct sk_buff *skb; + struct sadb_msg *hdr; + struct sadb_sa *sa; + struct sadb_address *addr; + struct sadb_x_nat_t_port *n_port; + struct sockaddr_in *sin; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + struct sockaddr_in6 *sin6; +#endif + int sockaddr_size; + int size; + __u8 satype = (x->id.proto == IPPROTO_ESP ? SADB_SATYPE_ESP : 0); + struct xfrm_encap_tmpl *natt = NULL; + + sockaddr_size = pfkey_sockaddr_size(x->props.family); + if (!sockaddr_size) + return -EINVAL; + + if (!satype) + return -EINVAL; + + if (!x->encap) + return -EINVAL; + + natt = x->encap; + + /* Build an SADB_X_NAT_T_NEW_MAPPING message: + * + * HDR | SA | ADDRESS_SRC (old addr) | NAT_T_SPORT (old port) | + * ADDRESS_DST (new addr) | NAT_T_DPORT (new port) + */ + + size = sizeof(struct sadb_msg) + + sizeof(struct sadb_sa) + + (sizeof(struct sadb_address) * 2) + + (sockaddr_size * 2) + + (sizeof(struct sadb_x_nat_t_port) * 2); + + skb = alloc_skb(size + 16, GFP_ATOMIC); + if (skb == NULL) + return -ENOMEM; + + hdr = (struct sadb_msg *) skb_put(skb, sizeof(struct sadb_msg)); + hdr->sadb_msg_version = PF_KEY_V2; + hdr->sadb_msg_type = SADB_X_NAT_T_NEW_MAPPING; + hdr->sadb_msg_satype = satype; + hdr->sadb_msg_len = size / sizeof(uint64_t); + hdr->sadb_msg_errno = 0; + hdr->sadb_msg_reserved = 0; + hdr->sadb_msg_seq = x->km.seq = get_acqseq(); + hdr->sadb_msg_pid = 0; + + /* SA */ + sa = (struct sadb_sa *) skb_put(skb, sizeof(struct sadb_sa)); + 
sa->sadb_sa_len = sizeof(struct sadb_sa)/sizeof(uint64_t); + sa->sadb_sa_exttype = SADB_EXT_SA; + sa->sadb_sa_spi = x->id.spi; + sa->sadb_sa_replay = 0; + sa->sadb_sa_state = 0; + sa->sadb_sa_auth = 0; + sa->sadb_sa_encrypt = 0; + sa->sadb_sa_flags = 0; + + /* ADDRESS_SRC (old addr) */ + addr = (struct sadb_address*) + skb_put(skb, sizeof(struct sadb_address)+sockaddr_size); + addr->sadb_address_len = + (sizeof(struct sadb_address)+sockaddr_size)/ + sizeof(uint64_t); + addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC; + addr->sadb_address_proto = 0; + addr->sadb_address_reserved = 0; + if (x->props.family == AF_INET) { + addr->sadb_address_prefixlen = 32; + + sin = (struct sockaddr_in *) (addr + 1); + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = x->props.saddr.a4; + sin->sin_port = 0; + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + else if (x->props.family == AF_INET6) { + addr->sadb_address_prefixlen = 128; + + sin6 = (struct sockaddr_in6 *) (addr + 1); + sin6->sin6_family = AF_INET6; + sin6->sin6_port = 0; + sin6->sin6_flowinfo = 0; + memcpy(&sin6->sin6_addr, + x->props.saddr.a6, sizeof(struct in6_addr)); + sin6->sin6_scope_id = 0; + } +#endif + else + BUG(); + + /* NAT_T_SPORT (old port) */ + n_port = (struct sadb_x_nat_t_port*) skb_put(skb, sizeof (*n_port)); + n_port->sadb_x_nat_t_port_len = sizeof(*n_port)/sizeof(uint64_t); + n_port->sadb_x_nat_t_port_exttype = SADB_X_EXT_NAT_T_SPORT; + n_port->sadb_x_nat_t_port_port = natt->encap_sport; + n_port->sadb_x_nat_t_port_reserved = 0; + + /* ADDRESS_DST (new addr): tagged DST so it is distinct from the old SRC address above */ + addr = (struct sadb_address*) + skb_put(skb, sizeof(struct sadb_address)+sockaddr_size); + addr->sadb_address_len = + (sizeof(struct sadb_address)+sockaddr_size)/ + sizeof(uint64_t); + addr->sadb_address_exttype = SADB_EXT_ADDRESS_DST; + addr->sadb_address_proto = 0; + addr->sadb_address_reserved = 0; + if (x->props.family == AF_INET) { + addr->sadb_address_prefixlen = 32; + + 
sin = (struct sockaddr_in *) (addr + 1); + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = ipaddr->a4; + sin->sin_port = 0; + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + else if (x->props.family == AF_INET6) { + addr->sadb_address_prefixlen = 128; + + sin6 = (struct sockaddr_in6 *) (addr + 1); + sin6->sin6_family = AF_INET6; + sin6->sin6_port = 0; + sin6->sin6_flowinfo = 0; + memcpy(&sin6->sin6_addr, &ipaddr->a6, sizeof(struct in6_addr)); + sin6->sin6_scope_id = 0; + } +#endif + else + BUG(); + + /* NAT_T_DPORT (new port) */ + n_port = (struct sadb_x_nat_t_port*) skb_put(skb, sizeof (*n_port)); + n_port->sadb_x_nat_t_port_len = sizeof(*n_port)/sizeof(uint64_t); + n_port->sadb_x_nat_t_port_exttype = SADB_X_EXT_NAT_T_DPORT; + n_port->sadb_x_nat_t_port_port = sport; + n_port->sadb_x_nat_t_port_reserved = 0; + + return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL); +} + +static int pfkey_sendmsg(struct socket *sock, struct msghdr *msg, int len, + struct scm_cookie *scm) +{ + struct sock *sk = sock->sk; + struct sk_buff *skb = NULL; + struct sadb_msg *hdr = NULL; + int err; + + err = -EOPNOTSUPP; + if (msg->msg_flags & MSG_OOB) + goto out; + + err = -EMSGSIZE; + if ((unsigned)len > sk->sndbuf-32) + goto out; + + err = -ENOBUFS; + skb = alloc_skb(len, GFP_KERNEL); + if (skb == NULL) + goto out; + + err = -EFAULT; + if (memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len)) + goto out; + + hdr = pfkey_get_base_msg(skb, &err); + if (!hdr) + goto out; + + down(&xfrm_cfg_sem); + err = pfkey_process(sk, skb, hdr); + up(&xfrm_cfg_sem); + +out: + if (err && hdr && pfkey_error(hdr, err, sk) == 0) + err = 0; + if (skb) + kfree_skb(skb); + + return err ? 
: len; +} + +static int pfkey_recvmsg(struct socket *sock, struct msghdr *msg, int len, + int flags, struct scm_cookie *scm) +{ + struct sock *sk = sock->sk; + struct sk_buff *skb; + int copied, err; + + err = -EINVAL; + if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC)) + goto out; + + msg->msg_namelen = 0; + skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err); + if (skb == NULL) + goto out; + + copied = skb->len; + if (copied > len) { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + + skb->h.raw = skb->data; + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + if (err) + goto out_free; + + sock_recv_timestamp(msg, sk, skb); + + err = (flags & MSG_TRUNC) ? skb->len : copied; + +out_free: + skb_free_datagram(sk, skb); +out: + return err; +} + +static struct proto_ops pfkey_ops = { + .family = PF_KEY, + + /* Operations that make no sense on pfkey sockets. */ + .bind = sock_no_bind, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = sock_no_getname, + .ioctl = sock_no_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = sock_no_setsockopt, + .getsockopt = sock_no_getsockopt, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, + + /* Now the operations that really occur. 
*/ + .release = pfkey_release, + .poll = datagram_poll, + .sendmsg = pfkey_sendmsg, + .recvmsg = pfkey_recvmsg, +}; + +static struct net_proto_family pfkey_family_ops = { + .family = PF_KEY, + .create = pfkey_create, +}; + +#ifdef CONFIG_PROC_FS +static int pfkey_read_proc(char *buffer, char **start, off_t offset, + int length, int *eof, void *data) +{ + off_t pos = 0; + off_t begin = 0; + int len = 0; + struct sock *s; + + len += sprintf(buffer,"sk RefCnt Rmem Wmem User Inode\n"); + + read_lock(&pfkey_table_lock); + + for (s = pfkey_table; s; s = s->next) { + len += sprintf(buffer+len,"%p %-6d %-6u %-6u %-6u %-6lu", + s, + atomic_read(&s->refcnt), + atomic_read(&s->rmem_alloc), + atomic_read(&s->wmem_alloc), + sock_i_uid(s), + sock_i_ino(s) + ); + + buffer[len++] = '\n'; + + pos = begin + len; + if (pos < offset) { + len = 0; + begin = pos; + } + if(pos > offset + length) + goto done; + } + *eof = 1; + +done: + read_unlock(&pfkey_table_lock); + + *start = buffer + (offset - begin); + len -= (offset - begin); + + if (len > length) + len = length; + if (len < 0) + len = 0; + + return len; +} +#endif + +static struct xfrm_mgr pfkeyv2_mgr = +{ + .id = "pfkeyv2", + .notify = pfkey_send_notify, + .acquire = pfkey_send_acquire, + .compile_policy = pfkey_compile_policy, + .new_mapping = pfkey_send_new_mapping, +}; + +static void __exit ipsec_pfkey_exit(void) +{ + xfrm_unregister_km(&pfkeyv2_mgr); + remove_proc_entry("net/pfkey", 0); + sock_unregister(PF_KEY); +} + +static int __init ipsec_pfkey_init(void) +{ + sock_register(&pfkey_family_ops); +#ifdef CONFIG_PROC_FS + create_proc_read_entry("net/pfkey", 0, 0, pfkey_read_proc, NULL); +#endif + xfrm_register_km(&pfkeyv2_mgr); + return 0; +} + +module_init(ipsec_pfkey_init); +module_exit(ipsec_pfkey_exit); +MODULE_LICENSE("GPL"); Index: net/netlink/af_netlink.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/netlink/af_netlink.c,v 
retrieving revision 1.1.1.21 retrieving revision 1.1.1.21.2.1 diff -u -r1.1.1.21 -r1.1.1.21.2.1 --- a/net/netlink/af_netlink.c 18 Feb 2004 13:36:32 -0000 1.1.1.21 +++ b/net/netlink/af_netlink.c 16 Apr 2004 13:16:26 -0000 1.1.1.21.2.1 @@ -496,13 +496,13 @@ return -1; } -void netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid, - u32 group, int allocation) +int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid, + u32 group, int allocation) { struct sock *sk; struct sk_buff *skb2 = NULL; int protocol = ssk->protocol; - int failure = 0; + int failure = 0, delivered = 0; /* While we sleep in clone, do not allow to change socket list */ @@ -536,8 +536,10 @@ failure = 1; } else if (netlink_broadcast_deliver(sk, skb2)) { netlink_overrun(sk); - } else + } else { + delivered = 1; skb2 = NULL; + } sock_put(sk); } @@ -546,6 +548,12 @@ if (skb2) kfree_skb(skb2); kfree_skb(skb); + + if (delivered) + return 0; + if (failure) + return -ENOBUFS; + return -ESRCH; } void netlink_set_err(struct sock *ssk, u32 pid, u32 group, int code) Index: net/sched/cls_route.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/sched/cls_route.c,v retrieving revision 1.1.1.16 retrieving revision 1.1.1.16.2.1 diff -u -r1.1.1.16 -r1.1.1.16.2.1 --- a/net/sched/cls_route.c 21 Dec 2001 17:42:06 -0000 1.1.1.16 +++ b/net/sched/cls_route.c 16 Apr 2004 13:16:27 -0000 1.1.1.16.2.1 @@ -154,7 +154,7 @@ if (head == NULL) goto old_method; - iif = ((struct rtable*)dst)->key.iif; + iif = ((struct rtable*)dst)->fl.iif; h = route4_fastmap_hash(id, iif); if (id == head->fastmap[h].id && Index: net/sctp/input.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/sctp/input.c,v retrieving revision 1.1.1.7 retrieving revision 1.1.1.7.2.1 diff -u -r1.1.1.7 -r1.1.1.7.2.1 --- a/net/sctp/input.c 14 Apr 2004 13:05:41 -0000 
1.1.1.7 +++ b/net/sctp/input.c 16 Apr 2004 13:16:27 -0000 1.1.1.7.2.1 @@ -58,6 +58,7 @@ #include #include #include +#include #include #include @@ -175,7 +176,7 @@ rcvr = asoc ? &asoc->base : &ep->base; sk = rcvr->sk; - if (!ipsec_sk_policy(sk, skb)) + if (!xfrm_policy_check(sk, XFRM_POLICY_IN, skb, family)) goto discard_release; ret = sk_filter(sk, skb, 1); Index: net/sctp/ipv6.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/sctp/ipv6.c,v retrieving revision 1.1.1.9 retrieving revision 1.1.1.9.2.2 diff -u -r1.1.1.9 -r1.1.1.9.2.2 --- a/net/sctp/ipv6.c 14 Apr 2004 13:05:41 -0000 1.1.1.9 +++ b/net/sctp/ipv6.c 16 Apr 2004 23:32:09 -0000 1.1.1.9.2.2 @@ -82,14 +82,14 @@ /* FIXME: This macro needs to be moved to a common header file. */ #define NIP6(addr) \ - ntohs((addr)->s6_addr16[0]), \ - ntohs((addr)->s6_addr16[1]), \ - ntohs((addr)->s6_addr16[2]), \ - ntohs((addr)->s6_addr16[3]), \ - ntohs((addr)->s6_addr16[4]), \ - ntohs((addr)->s6_addr16[5]), \ - ntohs((addr)->s6_addr16[6]), \ - ntohs((addr)->s6_addr16[7]) + ntohs((addr).s6_addr16[0]), \ + ntohs((addr).s6_addr16[1]), \ + ntohs((addr).s6_addr16[2]), \ + ntohs((addr).s6_addr16[3]), \ + ntohs((addr).s6_addr16[4]), \ + ntohs((addr).s6_addr16[5]), \ + ntohs((addr).s6_addr16[6]), \ + ntohs((addr).s6_addr16[7]) /* ICMP error handler. */ void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, @@ -165,12 +165,12 @@ /* Fill in the dest address from the route entry passed with the skb * and the source address from the transport. 
*/ - fl.fl6_dst = &transport->ipaddr.v6.sin6_addr; - fl.fl6_src = &transport->saddr.v6.sin6_addr; + ipv6_addr_copy(&fl.fl6_dst, &transport->ipaddr.v6.sin6_addr); + ipv6_addr_copy(&fl.fl6_src, &transport->saddr.v6.sin6_addr); fl.fl6_flowlabel = np->flow_label; IP6_ECN_flow_xmit(sk, fl.fl6_flowlabel); - if (ipv6_addr_type(fl.fl6_src) & IPV6_ADDR_LINKLOCAL) + if (ipv6_addr_type(&fl.fl6_src) & IPV6_ADDR_LINKLOCAL) fl.oif = transport->saddr.v6.sin6_scope_id; else fl.oif = sk->sk_bound_dev_if; @@ -179,7 +179,7 @@ if (np->opt && np->opt->srcrt) { struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt; - fl.fl6_dst = rt0->addr; + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); } SCTP_DEBUG_PRINTK("%s: skb:%p, len:%d, " @@ -204,7 +204,7 @@ struct flowi fl; memset(&fl, 0, sizeof(fl)); - fl.fl6_dst = &daddr->v6.sin6_addr; + ipv6_addr_copy(&fl.fl6_dst, &daddr->v6.sin6_addr); if (ipv6_addr_type(&daddr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) fl.oif = daddr->v6.sin6_scope_id; @@ -213,7 +213,7 @@ __FUNCTION__, NIP6(fl.fl6_dst)); if (saddr) { - fl.fl6_src = &saddr->v6.sin6_addr; + ipv6_addr_copy(&fl.fl6_src, &saddr->v6.sin6_addr); SCTP_DEBUG_PRINTK( "SRC=%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x - ", NIP6(fl.fl6_src)); @@ -226,7 +226,7 @@ SCTP_DEBUG_PRINTK( "rt6_dst:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x " "rt6_src:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", - NIP6(&rt->rt6i_dst.addr), NIP6(&rt->rt6i_src.addr)); + NIP6(rt->rt6i_dst.addr), NIP6(rt->rt6i_src.addr)); } else { SCTP_DEBUG_PRINTK("NO ROUTE\n"); } @@ -273,13 +273,13 @@ SCTP_DEBUG_PRINTK("%s: asoc:%p dst:%p " "daddr:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x ", - __FUNCTION__, asoc, dst, NIP6(&daddr->v6.sin6_addr)); + __FUNCTION__, asoc, dst, NIP6(daddr->v6.sin6_addr)); if (!asoc) { ipv6_get_saddr(dst, &daddr->v6.sin6_addr,&saddr->v6.sin6_addr); SCTP_DEBUG_PRINTK("saddr from ipv6_get_saddr: " "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", - NIP6(&saddr->v6.sin6_addr)); + NIP6(saddr->v6.sin6_addr)); return; } @@ -308,12 +308,12 @@ 
memcpy(saddr, baddr, sizeof(union sctp_addr)); SCTP_DEBUG_PRINTK("saddr: " "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", - NIP6(&saddr->v6.sin6_addr)); + NIP6(saddr->v6.sin6_addr)); } else { printk(KERN_ERR "%s: asoc:%p Could not find a valid source " "address for the " "dest:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", - __FUNCTION__, asoc, NIP6(&daddr->v6.sin6_addr)); + __FUNCTION__, asoc, NIP6(daddr->v6.sin6_addr)); } sctp_read_unlock(addr_lock); @@ -629,7 +629,7 @@ /* Init the ipv4 part of the socket since we can have sockets * using v6 API for ipv4. */ - newinet->ttl = sysctl_ip_default_ttl; + newinet->uc_ttl = -1; newinet->mc_loop = 1; newinet->mc_ttl = 1; newinet->mc_index = 0; @@ -678,7 +678,7 @@ static void sctp_v6_seq_dump_addr(struct seq_file *seq, union sctp_addr *addr) { seq_printf(seq, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x ", - NIP6(&addr->v6.sin6_addr)); + NIP6(addr->v6.sin6_addr)); } /* Initialize a PF_INET6 socket msg_name. */ @@ -912,14 +912,15 @@ .flags = SCTP_PROTOSW_FLAG, }; +static int sctp6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) +{ + return sctp_rcv(*pskb) ? -1 : 0; +} + static struct inet6_protocol sctpv6_protocol = { - .handler = sctp_rcv, + .handler = sctp6_rcv, .err_handler = sctp_v6_err, - .next = NULL, - .protocol = IPPROTO_SCTP, - .copy = 0, - .data = NULL, - .name = "SCTPv6", + .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL, }; static struct sctp_af sctp_ipv6_specific = { @@ -967,7 +968,8 @@ int sctp_v6_init(void) { /* Register inet6 protocol. */ - inet6_add_protocol(&sctpv6_protocol); + if (inet6_add_protocol(&sctpv6_protocol, IPPROTO_SCTP) < 0) + return -EAGAIN; /* Add SCTPv6(UDP and TCP style) to inetsw6 linked list. 
*/ inet6_register_protosw(&sctpv6_seqpacket_protosw); @@ -989,7 +991,7 @@ void sctp_v6_exit(void) { list_del(&sctp_ipv6_specific.list); - inet6_del_protocol(&sctpv6_protocol); + inet6_del_protocol(&sctpv6_protocol, IPPROTO_SCTP); inet6_unregister_protosw(&sctpv6_seqpacket_protosw); inet6_unregister_protosw(&sctpv6_stream_protosw); unregister_inet6addr_notifier(&sctp_inetaddr_notifier); Index: net/sctp/protocol.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/net/sctp/protocol.c,v retrieving revision 1.1.1.7 retrieving revision 1.1.1.7.2.1 diff -u -r1.1.1.7 -r1.1.1.7.2.1 --- a/net/sctp/protocol.c 14 Apr 2004 13:05:41 -0000 1.1.1.7 +++ b/net/sctp/protocol.c 16 Apr 2004 13:16:27 -0000 1.1.1.7.2.1 @@ -433,7 +433,7 @@ union sctp_addr *saddr) { struct rtable *rt; - struct rt_key key; + struct flowi fl; struct sctp_bind_addr *bp; rwlock_t *addr_lock; struct sctp_sockaddr_entry *laddr; @@ -441,21 +441,21 @@ struct dst_entry *dst = NULL; union sctp_addr dst_saddr; - memset(&key, 0x0, sizeof(struct rt_key)); - key.dst = daddr->v4.sin_addr.s_addr; - + memset(&fl, 0x0, sizeof(struct flowi)); + fl.fl4_dst = daddr->v4.sin_addr.s_addr; + fl.proto = IPPROTO_SCTP; if (asoc) { - key.tos = RT_CONN_FLAGS(asoc->base.sk); - key.oif = asoc->base.sk->bound_dev_if; + fl.fl4_tos = RT_CONN_FLAGS(asoc->base.sk); + fl.oif = asoc->base.sk->bound_dev_if; } if (saddr) - key.src = saddr->v4.sin_addr.s_addr; + fl.fl4_src = saddr->v4.sin_addr.s_addr; SCTP_DEBUG_PRINTK("%s: DST:%u.%u.%u.%u, SRC:%u.%u.%u.%u - ", - __FUNCTION__, NIPQUAD(key.dst), - NIPQUAD(key.src)); + __FUNCTION__, NIPQUAD(fl.fl4_dst), + NIPQUAD(fl.fl4_src)); - if (!ip_route_output_key(&rt, &key)) { + if (!ip_route_output_key(&rt, &fl)) { dst = &rt->u.dst; } @@ -497,8 +497,8 @@ laddr = list_entry(pos, struct sctp_sockaddr_entry, list); if (AF_INET == laddr->a.sa.sa_family) { - key.src = laddr->a.v4.sin_addr.s_addr; - if (!ip_route_output_key(&rt, 
&key)) { + fl.fl4_src = laddr->a.v4.sin_addr.s_addr; + if (!ip_route_output_key(&rt, &fl)) { dst = &rt->u.dst; goto out_unlock; } @@ -587,7 +587,7 @@ newinet->pmtudisc = inet->pmtudisc; newinet->id = 0; - newinet->ttl = sysctl_ip_default_ttl; + newinet->uc_ttl = -1; newinet->mc_loop = 1; newinet->mc_ttl = 1; newinet->mc_index = 0; @@ -656,7 +656,7 @@ return err; } sctp_ctl_socket->sk->sk_allocation = GFP_ATOMIC; - inet_sk(sctp_ctl_socket->sk)->ttl = MAXTTL; + inet_sk(sctp_ctl_socket->sk)->uc_ttl = -1; return 0; } @@ -872,8 +872,7 @@ static struct inet_protocol sctp_protocol = { .handler = sctp_rcv, .err_handler = sctp_v4_err, - .protocol = IPPROTO_SCTP, - .name = "SCTP" + .no_policy = 1, }; /* IPv4 address related functions. */ @@ -960,7 +959,8 @@ return -EINVAL; /* Add SCTP to inet_protos hash table. */ - inet_add_protocol(&sctp_protocol); + if (inet_add_protocol(&sctp_protocol, IPPROTO_SCTP) < 0) + return -EAGAIN; /* Add SCTP(TCP and UDP style) to inetsw linked list. */ inet_register_protosw(&sctp_seqpacket_protosw); @@ -1148,7 +1148,7 @@ err_init_mibs: kmem_cache_destroy(sctp_chunk_cachep); err_chunk_cachep: - inet_del_protocol(&sctp_protocol); + inet_del_protocol(&sctp_protocol, IPPROTO_SCTP); inet_unregister_protosw(&sctp_seqpacket_protosw); inet_unregister_protosw(&sctp_stream_protosw); return status; @@ -1188,7 +1188,7 @@ sctp_proc_exit(); cleanup_sctp_mibs(); - inet_del_protocol(&sctp_protocol); + inet_del_protocol(&sctp_protocol, IPPROTO_SCTP); inet_unregister_protosw(&sctp_seqpacket_protosw); inet_unregister_protosw(&sctp_stream_protosw); } Index: net/xfrm/Config.in =================================================================== RCS file: net/xfrm/Config.in diff -N net/xfrm/Config.in --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ b/net/xfrm/Config.in 16 Apr 2004 13:16:27 -0000 1.2.18.1 @@ -0,0 +1,4 @@ +# +# XFRM configuration +# +tristate ' IP: IPsec user configuration interface' CONFIG_XFRM_USER Index: net/xfrm/Makefile 
=================================================================== RCS file: net/xfrm/Makefile diff -N net/xfrm/Makefile --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ b/net/xfrm/Makefile 16 Apr 2004 13:16:27 -0000 1.3.18.1 @@ -0,0 +1,13 @@ +# +# Makefile for the XFRM subsystem. +# + +O_TARGET := xfrm.o + +export-objs = xfrm_export.o + +obj-$(CONFIG_XFRM) := xfrm_policy.o xfrm_state.o xfrm_input.o xfrm_algo.o xfrm_output.o \ + xfrm_export.o +obj-$(CONFIG_XFRM_USER) += xfrm_user.o + +include $(TOPDIR)/Rules.make Index: net/xfrm/xfrm_algo.c =================================================================== RCS file: net/xfrm/xfrm_algo.c diff -N net/xfrm/xfrm_algo.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ b/net/xfrm/xfrm_algo.c 16 Apr 2004 13:16:27 -0000 1.5.2.1 @@ -0,0 +1,729 @@ +/* + * xfrm algorithm interface + * + * Copyright (c) 2002 James Morris + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ +#include +#include +#include +#include +#if defined(CONFIG_INET_AH) || defined(CONFIG_INET_AH_MODULE) || defined(CONFIG_INET6_AH) || defined(CONFIG_INET6_AH_MODULE) +#include +#endif +#if defined(CONFIG_INET_ESP) || defined(CONFIG_INET_ESP_MODULE) || defined(CONFIG_INET6_ESP) || defined(CONFIG_INET6_ESP_MODULE) +#include +#endif +#include + +/* + * Algorithms supported by IPsec. These entries contain properties which + * are used in key negotiation and xfrm processing, and are used to verify + * that instantiated crypto transforms have correct parameters for IPsec + * purposes. 
+ */ +static struct xfrm_algo_desc aalg_list[] = { +{ + .name = "digest_null", + + .uinfo = { + .auth = { + .icv_truncbits = 0, + .icv_fullbits = 0, + } + }, + + .desc = { + .sadb_alg_id = SADB_X_AALG_NULL, + .sadb_alg_ivlen = 0, + .sadb_alg_minbits = 0, + .sadb_alg_maxbits = 0 + } +}, +{ + .name = "md5", + + .uinfo = { + .auth = { + .icv_truncbits = 96, + .icv_fullbits = 128, + } + }, + + .desc = { + .sadb_alg_id = SADB_AALG_MD5HMAC, + .sadb_alg_ivlen = 0, + .sadb_alg_minbits = 128, + .sadb_alg_maxbits = 128 + } +}, +{ + .name = "sha1", + + .uinfo = { + .auth = { + .icv_truncbits = 96, + .icv_fullbits = 160, + } + }, + + .desc = { + .sadb_alg_id = SADB_AALG_SHA1HMAC, + .sadb_alg_ivlen = 0, + .sadb_alg_minbits = 160, + .sadb_alg_maxbits = 160 + } +}, +{ + .name = "sha256", + + .uinfo = { + .auth = { + .icv_truncbits = 96, + .icv_fullbits = 256, + } + }, + + .desc = { + .sadb_alg_id = SADB_X_AALG_SHA2_256HMAC, + .sadb_alg_ivlen = 0, + .sadb_alg_minbits = 256, + .sadb_alg_maxbits = 256 + } +}, +{ + .name = "ripemd160", + + .uinfo = { + .auth = { + .icv_truncbits = 96, + .icv_fullbits = 160, + } + }, + + .desc = { + .sadb_alg_id = SADB_X_AALG_RIPEMD160HMAC, + .sadb_alg_ivlen = 0, + .sadb_alg_minbits = 160, + .sadb_alg_maxbits = 160 + } +}, +}; + +static struct xfrm_algo_desc ealg_list[] = { +{ + .name = "cipher_null", + + .uinfo = { + .encr = { + .blockbits = 8, + .defkeybits = 0, + } + }, + + .desc = { + .sadb_alg_id = SADB_EALG_NULL, + .sadb_alg_ivlen = 0, + .sadb_alg_minbits = 0, + .sadb_alg_maxbits = 0 + } +}, +{ + .name = "des", + + .uinfo = { + .encr = { + .blockbits = 64, + .defkeybits = 64, + } + }, + + .desc = { + .sadb_alg_id = SADB_EALG_DESCBC, + .sadb_alg_ivlen = 8, + .sadb_alg_minbits = 64, + .sadb_alg_maxbits = 64 + } +}, +{ + .name = "des3_ede", + + .uinfo = { + .encr = { + .blockbits = 64, + .defkeybits = 192, + } + }, + + .desc = { + .sadb_alg_id = SADB_EALG_3DESCBC, + .sadb_alg_ivlen = 8, + .sadb_alg_minbits = 192, + .sadb_alg_maxbits = 192 + } +}, 
+{ + .name = "cast128", + + .uinfo = { + .encr = { + .blockbits = 64, + .defkeybits = 128, + } + }, + + .desc = { + .sadb_alg_id = SADB_X_EALG_CASTCBC, + .sadb_alg_ivlen = 8, + .sadb_alg_minbits = 40, + .sadb_alg_maxbits = 128 + } +}, +{ + .name = "blowfish", + + .uinfo = { + .encr = { + .blockbits = 64, + .defkeybits = 128, + } + }, + + .desc = { + .sadb_alg_id = SADB_X_EALG_BLOWFISHCBC, + .sadb_alg_ivlen = 8, + .sadb_alg_minbits = 40, + .sadb_alg_maxbits = 448 + } +}, +{ + .name = "aes", + + .uinfo = { + .encr = { + .blockbits = 128, + .defkeybits = 128, + } + }, + + .desc = { + .sadb_alg_id = SADB_X_EALG_AESCBC, + .sadb_alg_ivlen = 8, + .sadb_alg_minbits = 128, + .sadb_alg_maxbits = 256 + } +}, +{ + .name = "serpent", + + .uinfo = { + .encr = { + .blockbits = 128, + .defkeybits = 128, + } + }, + + .desc = { + .sadb_alg_id = SADB_X_EALG_SERPENTCBC, + .sadb_alg_ivlen = 8, + .sadb_alg_minbits = 128, + .sadb_alg_maxbits = 256, + } +}, +{ + .name = "twofish", + + .uinfo = { + .encr = { + .blockbits = 128, + .defkeybits = 128, + } + }, + + .desc = { + .sadb_alg_id = SADB_X_EALG_TWOFISHCBC, + .sadb_alg_ivlen = 8, + .sadb_alg_minbits = 128, + .sadb_alg_maxbits = 256 + } +}, +}; + +static struct xfrm_algo_desc calg_list[] = { +{ + .name = "deflate", + .uinfo = { + .comp = { + .threshold = 90, + } + }, + .desc = { .sadb_alg_id = SADB_X_CALG_DEFLATE } +}, +{ + .name = "lzs", + .uinfo = { + .comp = { + .threshold = 90, + } + }, + .desc = { .sadb_alg_id = SADB_X_CALG_LZS } +}, +{ + .name = "lzjh", + .uinfo = { + .comp = { + .threshold = 50, + } + }, + .desc = { .sadb_alg_id = SADB_X_CALG_LZJH } +}, +}; + +static inline int aalg_entries(void) +{ + return sizeof(aalg_list) / sizeof(aalg_list[0]); +} + +static inline int ealg_entries(void) +{ + return sizeof(ealg_list) / sizeof(ealg_list[0]); +} + +static inline int calg_entries(void) +{ + return sizeof(calg_list) / sizeof(calg_list[0]); +} + +/* Todo: generic iterators */ +struct xfrm_algo_desc *xfrm_aalg_get_byid(int alg_id) 
+{ + int i; + + for (i = 0; i < aalg_entries(); i++) { + if (aalg_list[i].desc.sadb_alg_id == alg_id) { + if (aalg_list[i].available) + return &aalg_list[i]; + else + break; + } + } + return NULL; +} + +struct xfrm_algo_desc *xfrm_ealg_get_byid(int alg_id) +{ + int i; + + for (i = 0; i < ealg_entries(); i++) { + if (ealg_list[i].desc.sadb_alg_id == alg_id) { + if (ealg_list[i].available) + return &ealg_list[i]; + else + break; + } + } + return NULL; +} + +struct xfrm_algo_desc *xfrm_calg_get_byid(int alg_id) +{ + int i; + + for (i = 0; i < calg_entries(); i++) { + if (calg_list[i].desc.sadb_alg_id == alg_id) { + if (calg_list[i].available) + return &calg_list[i]; + else + break; + } + } + return NULL; +} + +struct xfrm_algo_desc *xfrm_aalg_get_byname(char *name) +{ + int i; + + if (!name) + return NULL; + + for (i=0; i < aalg_entries(); i++) { + if (strcmp(name, aalg_list[i].name) == 0) { + if (aalg_list[i].available) + return &aalg_list[i]; + else + break; + } + } + return NULL; +} + +struct xfrm_algo_desc *xfrm_ealg_get_byname(char *name) +{ + int i; + + if (!name) + return NULL; + + for (i=0; i < ealg_entries(); i++) { + if (strcmp(name, ealg_list[i].name) == 0) { + if (ealg_list[i].available) + return &ealg_list[i]; + else + break; + } + } + return NULL; +} + +struct xfrm_algo_desc *xfrm_calg_get_byname(char *name) +{ + int i; + + if (!name) + return NULL; + + for (i=0; i < calg_entries(); i++) { + if (strcmp(name, calg_list[i].name) == 0) { + if (calg_list[i].available) + return &calg_list[i]; + else + break; + } + } + return NULL; +} + +struct xfrm_algo_desc *xfrm_aalg_get_byidx(unsigned int idx) +{ + if (idx >= aalg_entries()) + return NULL; + + return &aalg_list[idx]; +} + +struct xfrm_algo_desc *xfrm_ealg_get_byidx(unsigned int idx) +{ + if (idx >= ealg_entries()) + return NULL; + + return &ealg_list[idx]; +} + +struct xfrm_algo_desc *xfrm_calg_get_byidx(unsigned int idx) +{ + if (idx >= calg_entries()) + return NULL; + + return &calg_list[idx]; +} + +/* + 
* Probe for the availability of crypto algorithms, and set the available + * flag for any algorithms found on the system. This is typically called by + * pfkey during userspace SA add, update or register. + */ +void xfrm_probe_algs(void) +{ +#ifdef CONFIG_CRYPTO + int i, status; + + BUG_ON(in_softirq()); + + for (i = 0; i < aalg_entries(); i++) { + status = crypto_alg_available(aalg_list[i].name, 0); + if (aalg_list[i].available != status) + aalg_list[i].available = status; + } + + for (i = 0; i < ealg_entries(); i++) { + status = crypto_alg_available(ealg_list[i].name, 0); + if (ealg_list[i].available != status) + ealg_list[i].available = status; + } + + for (i = 0; i < calg_entries(); i++) { + status = crypto_alg_available(calg_list[i].name, 0); + if (calg_list[i].available != status) + calg_list[i].available = status; + } +#endif +} + +int xfrm_count_auth_supported(void) +{ + int i, n; + + for (i = 0, n = 0; i < aalg_entries(); i++) + if (aalg_list[i].available) + n++; + return n; +} + +int xfrm_count_enc_supported(void) +{ + int i, n; + + for (i = 0, n = 0; i < ealg_entries(); i++) + if (ealg_list[i].available) + n++; + return n; +} + +/* Move to common area: it is shared with AH. */ + +void skb_icv_walk(const struct sk_buff *skb, struct crypto_tfm *tfm, + int offset, int len, icv_update_fn_t icv_update) +{ + int start = skb->len - skb->data_len; + int i, copy = start - offset; + struct scatterlist sg; + + /* Checksum header. 
*/ + if (copy > 0) { + if (copy > len) + copy = len; + + sg.page = virt_to_page(skb->data + offset); + sg.offset = (unsigned long)(skb->data + offset) % PAGE_SIZE; + sg.length = copy; + + icv_update(tfm, &sg, 1); + + if ((len -= copy) == 0) + return; + offset += copy; + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + + BUG_TRAP(start <= offset + len); + + end = start + skb_shinfo(skb)->frags[i].size; + if ((copy = end - offset) > 0) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + if (copy > len) + copy = len; + + sg.page = frag->page; + sg.offset = frag->page_offset + offset-start; + sg.length = copy; + + icv_update(tfm, &sg, 1); + + if (!(len -= copy)) + return; + offset += copy; + } + start = end; + } + + if (skb_shinfo(skb)->frag_list) { + struct sk_buff *list = skb_shinfo(skb)->frag_list; + + for (; list; list = list->next) { + int end; + + BUG_TRAP(start <= offset + len); + + end = start + list->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + skb_icv_walk(list, tfm, offset-start, copy, icv_update); + if ((len -= copy) == 0) + return; + offset += copy; + } + start = end; + } + } + if (len) + BUG(); +} + +#if defined(CONFIG_INET_ESP) || defined(CONFIG_INET_ESP_MODULE) || defined(CONFIG_INET6_ESP) || defined(CONFIG_INET6_ESP_MODULE) + +/* Looking generic it is not used in another places. 
*/ + +int +skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) +{ + int start = skb->len - skb->data_len; + int i, copy = start - offset; + int elt = 0; + + if (copy > 0) { + if (copy > len) + copy = len; + sg[elt].page = virt_to_page(skb->data + offset); + sg[elt].offset = (unsigned long)(skb->data + offset) % PAGE_SIZE; + sg[elt].length = copy; + elt++; + if ((len -= copy) == 0) + return elt; + offset += copy; + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + + BUG_TRAP(start <= offset + len); + + end = start + skb_shinfo(skb)->frags[i].size; + if ((copy = end - offset) > 0) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + if (copy > len) + copy = len; + sg[elt].page = frag->page; + sg[elt].offset = frag->page_offset+offset-start; + sg[elt].length = copy; + elt++; + if (!(len -= copy)) + return elt; + offset += copy; + } + start = end; + } + + if (skb_shinfo(skb)->frag_list) { + struct sk_buff *list = skb_shinfo(skb)->frag_list; + + for (; list; list = list->next) { + int end; + + BUG_TRAP(start <= offset + len); + + end = start + list->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + elt += skb_to_sgvec(list, sg+elt, offset - start, copy); + if ((len -= copy) == 0) + return elt; + offset += copy; + } + start = end; + } + } + if (len) + BUG(); + return elt; +} + +/* Check that skb data bits are writable. If they are not, copy data + * to newly created private area. If "tailbits" is given, make sure that + * tailbits bytes beyond current end of skb are writable. + * + * Returns amount of elements of scatterlist to load for subsequent + * transformations and pointer to writable trailer skb. 
+ */ + +int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) +{ + int copyflag; + int elt; + struct sk_buff *skb1, **skb_p; + + /* If skb is cloned or its head is paged, reallocate + * head pulling out all the pages (pages are considered not writable + * at the moment even if they are anonymous). + */ + if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) && + __pskb_pull_tail(skb, skb_pagelen(skb)-skb_headlen(skb)) == NULL) + return -ENOMEM; + + /* Easy case. Most of packets will go this way. */ + if (!skb_shinfo(skb)->frag_list) { + /* A little of trouble, not enough of space for trailer. + * This should not happen, when stack is tuned to generate + * good frames. OK, on miss we reallocate and reserve even more + * space, 128 bytes is fair. */ + + if (skb_tailroom(skb) < tailbits && + pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC)) + return -ENOMEM; + + /* Voila! */ + *trailer = skb; + return 1; + } + + /* Misery. We are in troubles, going to mincer fragments... */ + + elt = 1; + skb_p = &skb_shinfo(skb)->frag_list; + copyflag = 0; + + while ((skb1 = *skb_p) != NULL) { + int ntail = 0; + + /* The fragment is partially pulled by someone, + * this can happen on input. Copy it and everything + * after it. */ + + if (skb_shared(skb1)) + copyflag = 1; + + /* If the skb is the last, worry about trailer. */ + + if (skb1->next == NULL && tailbits) { + if (skb_shinfo(skb1)->nr_frags || + skb_shinfo(skb1)->frag_list || + skb_tailroom(skb1) < tailbits) + ntail = tailbits + 128; + } + + if (copyflag || + skb_cloned(skb1) || + ntail || + skb_shinfo(skb1)->nr_frags || + skb_shinfo(skb1)->frag_list) { + struct sk_buff *skb2; + + /* This fragment cannot be modified in place: replace it + * with a private copy (with ntail bytes of tailroom when + * it is the last one and must hold the trailer). */ + if (ntail == 0) + skb2 = skb_copy(skb1, GFP_ATOMIC); + else + skb2 = skb_copy_expand(skb1, + skb_headroom(skb1), + ntail, + GFP_ATOMIC); + if (unlikely(skb2 == NULL)) + return -ENOMEM; + + /* Charge the copy, not the head skb, to the socket that + * owned the fragment being replaced. */ + if (skb1->sk) + skb_set_owner_w(skb2, skb1->sk); + + /* Looking around.
Are we still alive? + * OK, link new skb, drop old one */ + + skb2->next = skb1->next; + *skb_p = skb2; + kfree_skb(skb1); + skb1 = skb2; + } + elt++; + *trailer = skb1; + skb_p = &skb1->next; + } + + return elt; +} + +void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len) +{ + if (tail != skb) { + skb->data_len += len; + skb->len += len; + } + return skb_put(tail, len); +} +#endif Index: net/xfrm/xfrm_export.c =================================================================== RCS file: net/xfrm/xfrm_export.c diff -N net/xfrm/xfrm_export.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ b/net/xfrm/xfrm_export.c 16 Apr 2004 13:16:27 -0000 1.1.3.1.20.1 @@ -0,0 +1,75 @@ +#include +#include + +EXPORT_SYMBOL(xfrm_user_policy); +EXPORT_SYMBOL(km_waitq); +EXPORT_SYMBOL(km_new_mapping); +EXPORT_SYMBOL(xfrm_cfg_sem); +EXPORT_SYMBOL(xfrm_policy_alloc); +EXPORT_SYMBOL(__xfrm_policy_destroy); +EXPORT_SYMBOL(xfrm_lookup); +EXPORT_SYMBOL(__xfrm_policy_check); +EXPORT_SYMBOL(__xfrm_route_forward); +EXPORT_SYMBOL(xfrm_state_alloc); +EXPORT_SYMBOL(__xfrm_state_destroy); +EXPORT_SYMBOL(xfrm_state_find); +EXPORT_SYMBOL(xfrm_state_insert); +EXPORT_SYMBOL(xfrm_state_add); +EXPORT_SYMBOL(xfrm_state_update); +EXPORT_SYMBOL(xfrm_state_check_expire); +EXPORT_SYMBOL(xfrm_state_check_space); +EXPORT_SYMBOL(xfrm_state_lookup); +EXPORT_SYMBOL(xfrm_state_register_afinfo); +EXPORT_SYMBOL(xfrm_state_unregister_afinfo); +EXPORT_SYMBOL(xfrm_state_get_afinfo); +EXPORT_SYMBOL(xfrm_state_put_afinfo); +EXPORT_SYMBOL(xfrm_state_delete_tunnel); +EXPORT_SYMBOL(xfrm_replay_check); +EXPORT_SYMBOL(xfrm_replay_advance); +EXPORT_SYMBOL(xfrm_check_selectors); +EXPORT_SYMBOL(xfrm_check_output); +EXPORT_SYMBOL(__secpath_destroy); +EXPORT_SYMBOL(secpath_dup); +EXPORT_SYMBOL(xfrm_get_acqseq); +EXPORT_SYMBOL(xfrm_parse_spi); +EXPORT_SYMBOL(xfrm4_rcv); +EXPORT_SYMBOL(xfrm4_tunnel_register); +EXPORT_SYMBOL(xfrm4_tunnel_deregister); +EXPORT_SYMBOL(xfrm4_tunnel_check_size); +EXPORT_SYMBOL(xfrm_register_type); 
+EXPORT_SYMBOL(xfrm_unregister_type); +EXPORT_SYMBOL(xfrm_get_type); +EXPORT_SYMBOL(xfrm_register_km); +EXPORT_SYMBOL(xfrm_unregister_km); +EXPORT_SYMBOL(xfrm_state_delete); +EXPORT_SYMBOL(xfrm_state_walk); +EXPORT_SYMBOL(xfrm_find_acq_byseq); +EXPORT_SYMBOL(xfrm_find_acq); +EXPORT_SYMBOL(xfrm_alloc_spi); +EXPORT_SYMBOL(xfrm_state_flush); +EXPORT_SYMBOL(xfrm_policy_kill); +EXPORT_SYMBOL(xfrm_policy_bysel); +EXPORT_SYMBOL(xfrm_policy_insert); +EXPORT_SYMBOL(xfrm_policy_walk); +EXPORT_SYMBOL(xfrm_policy_flush); +EXPORT_SYMBOL(xfrm_policy_byid); +EXPORT_SYMBOL(xfrm_policy_list); +EXPORT_SYMBOL(xfrm_dst_lookup); +EXPORT_SYMBOL(xfrm_policy_register_afinfo); +EXPORT_SYMBOL(xfrm_policy_unregister_afinfo); +EXPORT_SYMBOL(xfrm_policy_get_afinfo); +EXPORT_SYMBOL(xfrm_policy_put_afinfo); + +EXPORT_SYMBOL_GPL(xfrm_probe_algs); +EXPORT_SYMBOL_GPL(xfrm_count_auth_supported); +EXPORT_SYMBOL_GPL(xfrm_count_enc_supported); +EXPORT_SYMBOL_GPL(xfrm_aalg_get_byidx); +EXPORT_SYMBOL_GPL(xfrm_ealg_get_byidx); +EXPORT_SYMBOL_GPL(xfrm_calg_get_byidx); +EXPORT_SYMBOL_GPL(xfrm_aalg_get_byid); +EXPORT_SYMBOL_GPL(xfrm_ealg_get_byid); +EXPORT_SYMBOL_GPL(xfrm_calg_get_byid); +EXPORT_SYMBOL_GPL(xfrm_aalg_get_byname); +EXPORT_SYMBOL_GPL(xfrm_ealg_get_byname); +EXPORT_SYMBOL_GPL(xfrm_calg_get_byname); +EXPORT_SYMBOL_GPL(skb_icv_walk); Index: net/xfrm/xfrm_input.c =================================================================== RCS file: net/xfrm/xfrm_input.c diff -N net/xfrm/xfrm_input.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ b/net/xfrm/xfrm_input.c 16 Apr 2004 13:16:27 -0000 1.4.18.1 @@ -0,0 +1,85 @@ +/* + * xfrm_input.c + * + * Changes: + * YOSHIFUJI Hideaki @USAGI + * Split up af-specific portion + * + */ + +#include +#include +#include + +static kmem_cache_t *secpath_cachep; + +void __secpath_destroy(struct sec_path *sp) +{ + int i; + for (i = 0; i < sp->len; i++) + xfrm_state_put(sp->x[i].xvec); + kmem_cache_free(secpath_cachep, sp); +} + +struct sec_path *secpath_dup(struct sec_path 
*src) +{ + struct sec_path *sp; + + sp = kmem_cache_alloc(secpath_cachep, SLAB_ATOMIC); + if (!sp) + return NULL; + + sp->len = 0; + if (src) { + int i; + + memcpy(sp, src, sizeof(*sp)); + for (i = 0; i < sp->len; i++) + xfrm_state_hold(sp->x[i].xvec); + } + atomic_set(&sp->refcnt, 1); + return sp; +} + +/* Fetch spi and seq from ipsec header. + * + * For AH and ESP both values are copied from the header at skb->h.raw + * without byte swapping. For IPComp the 16-bit value at offset 2 + * (presumably the CPI) is widened into *spi and *seq is set to 0. + * + * Returns 0 on success, 1 when nexthdr is none of the above, and + * -EINVAL when pskb_may_pull() fails. + */ + +int xfrm_parse_spi(struct sk_buff *skb, u8 nexthdr, u32 *spi, u32 *seq) +{ + int offset, offset_seq; + + switch (nexthdr) { + case IPPROTO_AH: + offset = offsetof(struct ip_auth_hdr, spi); + offset_seq = offsetof(struct ip_auth_hdr, seq_no); + break; + case IPPROTO_ESP: + offset = offsetof(struct ip_esp_hdr, spi); + offset_seq = offsetof(struct ip_esp_hdr, seq_no); + break; + case IPPROTO_COMP: + if (!pskb_may_pull(skb, sizeof(struct ip_comp_hdr))) + return -EINVAL; + /* 16-bit value to host order, then out again as a 32-bit + * network-order SPI: htonl(), not ntohl() (same bits). */ + *spi = htonl(ntohs(*(u16*)(skb->h.raw + 2))); + *seq = 0; + return 0; + default: + return 1; + } + + if (!pskb_may_pull(skb, 16)) + return -EINVAL; + + *spi = *(u32*)(skb->h.raw + offset); + *seq = *(u32*)(skb->h.raw + offset_seq); + return 0; +} + +void __init xfrm_input_init(void) +{ + secpath_cachep = kmem_cache_create("secpath_cache", + sizeof(struct sec_path), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (!secpath_cachep) + panic("XFRM: failed to allocate secpath_cache\n"); +} Index: net/xfrm/xfrm_output.c =================================================================== RCS file: net/xfrm/xfrm_output.c diff -N net/xfrm/xfrm_output.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ b/net/xfrm/xfrm_output.c 16 Apr 2004 13:16:27 -0000 1.2.18.1 @@ -0,0 +1,46 @@ +/* + * generic xfrm output routines + * + * Copyright (c) 2003 James Morris + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version.
+ */ +#include +#include +#include +#include + +int xfrm_check_output(struct xfrm_state *x, + struct sk_buff *skb, unsigned short family) +{ + int err; + + err = xfrm_state_check_expire(x); + if (err) + goto out; + + if (x->props.mode) { + switch (family) { + case AF_INET: + err = xfrm4_tunnel_check_size(skb); + break; + + case AF_INET6: + err = xfrm6_tunnel_check_size(skb); + break; + + default: + err = -EINVAL; + } + + if (err) + goto out; + } + + err = xfrm_state_check_space(x, skb); +out: + return err; +} Index: net/xfrm/xfrm_policy.c =================================================================== RCS file: net/xfrm/xfrm_policy.c diff -N net/xfrm/xfrm_policy.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ b/net/xfrm/xfrm_policy.c 16 Apr 2004 13:16:27 -0000 1.12.2.1 @@ -0,0 +1,1250 @@ +/* + * xfrm_policy.c + * + * Changes: + * Mitsuru KANDA @USAGI + * Kazunori MIYAZAWA @USAGI + * Kunihiro Ishiguro + * IPv6 support + * Kazunori MIYAZAWA @USAGI + * YOSHIFUJI Hideaki + * Split up af-specific portion + * Derek Atkins Add the post_input processor + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +DECLARE_MUTEX(xfrm_cfg_sem); + +static rwlock_t xfrm_policy_lock = RW_LOCK_UNLOCKED; + +struct xfrm_policy *xfrm_policy_list[XFRM_POLICY_MAX*2]; + +static rwlock_t xfrm_policy_afinfo_lock = RW_LOCK_UNLOCKED; +static struct xfrm_policy_afinfo *xfrm_policy_afinfo[NPROTO]; + +kmem_cache_t *xfrm_dst_cache; + +static struct tq_struct xfrm_policy_gc_work; +static struct list_head xfrm_policy_gc_list = + LIST_HEAD_INIT(xfrm_policy_gc_list); +static spinlock_t xfrm_policy_gc_lock = SPIN_LOCK_UNLOCKED; + +int xfrm_register_type(struct xfrm_type *type, unsigned short family) +{ + struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); + struct xfrm_type_map *typemap; + int err = 0; + + if (unlikely(afinfo == NULL)) + return -EAFNOSUPPORT; + typemap = afinfo->type_map; + + write_lock(&typemap->lock); + if 
(likely(typemap->map[type->proto] == NULL)) + typemap->map[type->proto] = type; + else + err = -EEXIST; + write_unlock(&typemap->lock); + xfrm_policy_put_afinfo(afinfo); + return err; +} + +int xfrm_unregister_type(struct xfrm_type *type, unsigned short family) +{ + struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); + struct xfrm_type_map *typemap; + int err = 0; + + if (unlikely(afinfo == NULL)) + return -EAFNOSUPPORT; + typemap = afinfo->type_map; + + write_lock(&typemap->lock); + if (unlikely(typemap->map[type->proto] != type)) + err = -ENOENT; + else + typemap->map[type->proto] = NULL; + write_unlock(&typemap->lock); + xfrm_policy_put_afinfo(afinfo); + return err; +} + +struct xfrm_type *xfrm_get_type(u8 proto, unsigned short family) +{ + struct xfrm_policy_afinfo *afinfo; + struct xfrm_type_map *typemap; + struct xfrm_type *type; + int modload_attempted = 0; + +retry: + afinfo = xfrm_policy_get_afinfo(family); + if (unlikely(afinfo == NULL)) + return NULL; + typemap = afinfo->type_map; + + read_lock(&typemap->lock); + type = typemap->map[proto]; + if (type && type->owner) + __MOD_INC_USE_COUNT(type->owner); + read_unlock(&typemap->lock); + if (!type && !modload_attempted) { + char module_name[36]; + + xfrm_policy_put_afinfo(afinfo); + sprintf(module_name, "xfrm-type-%d-%d", + (int) family, (int) proto); + request_module(module_name); + modload_attempted = 1; + goto retry; + } + + xfrm_policy_put_afinfo(afinfo); + return type; +} + +int xfrm_dst_lookup(struct xfrm_dst **dst, struct flowi *fl, + unsigned short family) +{ + struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); + int err = 0; + + if (unlikely(afinfo == NULL)) + return -EAFNOSUPPORT; + + if (likely(afinfo->dst_lookup != NULL)) + err = afinfo->dst_lookup(dst, fl); + else + err = -EINVAL; + xfrm_policy_put_afinfo(afinfo); + return err; +} + +void xfrm_put_type(struct xfrm_type *type) +{ + if (type->owner) + __MOD_DEC_USE_COUNT(type->owner); +} + +static inline unsigned 
long make_jiffies(long secs) +{ + if (secs >= (MAX_SCHEDULE_TIMEOUT-1)/HZ) + return MAX_SCHEDULE_TIMEOUT-1; + else + return secs*HZ; +} + +static void xfrm_policy_timer(unsigned long data) +{ + struct xfrm_policy *xp = (struct xfrm_policy*)data; + unsigned long now = (unsigned long)xtime.tv_sec; + long next = LONG_MAX; + int warn = 0; + int dir; + + if (xp->dead) + goto out; + + dir = xp->index & 7; + + if (xp->lft.hard_add_expires_seconds) { + long tmo = xp->lft.hard_add_expires_seconds + + xp->curlft.add_time - now; + if (tmo <= 0) + goto expired; + if (tmo < next) + next = tmo; + } + if (xp->lft.hard_use_expires_seconds) { + long tmo = xp->lft.hard_use_expires_seconds + + (xp->curlft.use_time ? : xp->curlft.add_time) - now; + if (tmo <= 0) + goto expired; + if (tmo < next) + next = tmo; + } + if (xp->lft.soft_add_expires_seconds) { + long tmo = xp->lft.soft_add_expires_seconds + + xp->curlft.add_time - now; + if (tmo <= 0) { + warn = 1; + tmo = XFRM_KM_TIMEOUT; + } + if (tmo < next) + next = tmo; + } + if (xp->lft.soft_use_expires_seconds) { + long tmo = xp->lft.soft_use_expires_seconds + + (xp->curlft.use_time ? : xp->curlft.add_time) - now; + if (tmo <= 0) { + warn = 1; + tmo = XFRM_KM_TIMEOUT; + } + if (tmo < next) + next = tmo; + } + + if (warn) + km_policy_expired(xp, dir, 0); + if (next != LONG_MAX && + !mod_timer(&xp->timer, jiffies + make_jiffies(next))) + xfrm_pol_hold(xp); + +out: + xfrm_pol_put(xp); + return; + +expired: + km_policy_expired(xp, dir, 1); + xfrm_policy_delete(xp, dir); + xfrm_pol_put(xp); +} + + +/* Allocate xfrm_policy. Not used here, it is supposed to be used by pfkeyv2 + * SPD calls. 
+ */ + +struct xfrm_policy *xfrm_policy_alloc(int gfp) +{ + struct xfrm_policy *policy; + + policy = kmalloc(sizeof(struct xfrm_policy), gfp); + + if (policy) { /* zero it, set refcnt to 1, prepare (but do not start) the timer */ + memset(policy, 0, sizeof(struct xfrm_policy)); + atomic_set(&policy->refcnt, 1); + policy->lock = RW_LOCK_UNLOCKED; + init_timer(&policy->timer); + policy->timer.data = (unsigned long)policy; + policy->timer.function = xfrm_policy_timer; + } + return policy; /* NULL when kmalloc() failed */ +} + +/* Free an xfrm_policy. The policy must already be marked dead, have no + * bundles attached and no pending timer; anything else is a BUG(). */ + +void __xfrm_policy_destroy(struct xfrm_policy *policy) +{ + if (!policy->dead) + BUG(); + + if (policy->bundles) + BUG(); + + if (del_timer(&policy->timer)) + BUG(); + + kfree(policy); +} + /* GC one dead policy: free its bundles, drop a reference when del_timer() + * reports the timer was still pending, flush the flow cache if more than + * one reference remains, then drop one reference. */ +static void xfrm_policy_gc_kill(struct xfrm_policy *policy) +{ + struct dst_entry *dst; + + while ((dst = policy->bundles) != NULL) { + policy->bundles = dst->next; + dst_free(dst); + } + + if (del_timer(&policy->timer)) + atomic_dec(&policy->refcnt); + + if (atomic_read(&policy->refcnt) > 1) + flow_cache_flush(); + + xfrm_pol_put(policy); +} + /* Detach the pending GC list under xfrm_policy_gc_lock, then run + * xfrm_policy_gc_kill() on each queued policy. data is not used. */ +static void xfrm_policy_gc_task(void *data) +{ + struct xfrm_policy *policy; + struct list_head *entry, *tmp; + struct list_head gc_list = LIST_HEAD_INIT(gc_list); + + spin_lock_bh(&xfrm_policy_gc_lock); + list_splice_init(&xfrm_policy_gc_list, &gc_list); + spin_unlock_bh(&xfrm_policy_gc_lock); + + list_for_each_safe(entry, tmp, &gc_list) { + policy = list_entry(entry, struct xfrm_policy, list); + xfrm_policy_gc_kill(policy); + } +} + +/* Mark the rule dead, queue it on xfrm_policy_gc_list and schedule the GC + * work. Takes policy->lock itself. The rule must be unlinked from lists to the moment.
+ */ + +void xfrm_policy_kill(struct xfrm_policy *policy) +{ + write_lock_bh(&policy->lock); + if (policy->dead) /* already killed: nothing to queue */ + goto out; + + policy->dead = 1; + /* queue the policy on the GC list and schedule the GC work */ + spin_lock(&xfrm_policy_gc_lock); + list_add(&policy->list, &xfrm_policy_gc_list); + spin_unlock(&xfrm_policy_gc_lock); + schedule_task(&xfrm_policy_gc_work); + +out: + write_unlock_bh(&policy->lock); +} + +/* Generate a new policy index: (idx_generator | dir), with the generator + * advancing in steps of 8. A candidate already used by a policy on + * xfrm_policy_list[dir] is skipped. KAME seems to generate them ordered + * by cost of an absolute unpredictability of ordering of rules. This will not pass. */ +static u32 xfrm_gen_index(int dir) +{ + u32 idx; + struct xfrm_policy *p; + static u32 idx_generator; + + for (;;) { + idx = (idx_generator | dir); + idx_generator += 8; + if (idx == 0) /* never hand out index 0 */ + idx = 8; + for (p = xfrm_policy_list[dir]; p; p = p->next) { + if (p->index == idx) + break; + } + if (!p) + return idx; + } +} + /* Insert policy into xfrm_policy_list[dir], ahead of the first entry with + * a larger priority value. An entry with a bytewise-identical selector is + * unlinked and replaced, or -EEXIST is returned when excl is set. */ +int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) +{ + struct xfrm_policy *pol, **p; + struct xfrm_policy *delpol = NULL; + struct xfrm_policy **newpos = NULL; + + write_lock_bh(&xfrm_policy_lock); + for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL; p = &pol->next) { + if (!delpol && memcmp(&policy->selector, &pol->selector, sizeof(pol->selector)) == 0) { + if (excl) { + write_unlock_bh(&xfrm_policy_lock); + return -EEXIST; + } + *p = pol->next; /* unlink the policy being replaced */ + delpol = pol; + if (policy->priority > pol->priority) + continue; + } else if (policy->priority >= pol->priority) + continue; + if (!newpos) + newpos = p; + if (delpol) + break; + } + if (newpos) + p = newpos; + xfrm_pol_hold(policy); + policy->next = *p; + *p = policy; + atomic_inc(&flow_cache_genid); + policy->index = delpol ?
delpol->index : xfrm_gen_index(dir); + policy->curlft.add_time = (unsigned long)xtime.tv_sec; + policy->curlft.use_time = 0; + if (!mod_timer(&policy->timer, jiffies + HZ)) + xfrm_pol_hold(policy); + write_unlock_bh(&xfrm_policy_lock); + + if (delpol) { + xfrm_policy_kill(delpol); + } + wake_up(&km_waitq); + return 0; +} + +/* Find the policy on xfrm_policy_list[dir] whose selector is bytewise equal + * to *sel. A reference is taken on the returned policy; with delete set + * the policy is also unlinked and killed. Returns NULL if none matches. */ +struct xfrm_policy *xfrm_policy_bysel(int dir, struct xfrm_selector *sel, + int delete) +{ + struct xfrm_policy *pol, **p; + + write_lock_bh(&xfrm_policy_lock); + for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL; p = &pol->next) { + if (memcmp(sel, &pol->selector, sizeof(*sel)) == 0) { + xfrm_pol_hold(pol); + if (delete) + *p = pol->next; + break; + } + } + write_unlock_bh(&xfrm_policy_lock); + + if (pol && delete) { + atomic_inc(&flow_cache_genid); + xfrm_policy_kill(pol); + wake_up(&km_waitq); + } + return pol; +} + +/* Find the policy with index id. The list searched is selected by the low + * three bits of id; the dir argument is not used. A reference is taken on + * the returned policy; with delete set it is also unlinked and killed. + * Returns NULL if there is no such policy or id selects no valid list. */ +struct xfrm_policy *xfrm_policy_byid(int dir, u32 id, int delete) +{ + struct xfrm_policy *pol, **p; + + /* xfrm_policy_list[] has XFRM_POLICY_MAX*2 slots; an id supplied by + * the caller must not be allowed to index past them. */ + if ((id & 7) >= XFRM_POLICY_MAX*2) + return NULL; + + write_lock_bh(&xfrm_policy_lock); + for (p = &xfrm_policy_list[id & 7]; (pol=*p)!=NULL; p = &pol->next) { + if (pol->index == id) { + xfrm_pol_hold(pol); + if (delete) + *p = pol->next; + break; + } + } + write_unlock_bh(&xfrm_policy_lock); + + if (pol && delete) { + atomic_inc(&flow_cache_genid); + xfrm_policy_kill(pol); + wake_up(&km_waitq); + } + return pol; +} + +/* Unlink and kill every policy on the first XFRM_POLICY_MAX lists. The + * list lock is dropped around each xfrm_policy_kill() call. */ +void xfrm_policy_flush(void) +{ + struct xfrm_policy *xp; + int dir; + + write_lock_bh(&xfrm_policy_lock); + for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { + while ((xp = xfrm_policy_list[dir]) != NULL) { + xfrm_policy_list[dir] = xp->next; + write_unlock_bh(&xfrm_policy_lock); + + xfrm_policy_kill(xp); + + write_lock_bh(&xfrm_policy_lock); + } + } + atomic_inc(&flow_cache_genid); + write_unlock_bh(&xfrm_policy_lock); + wake_up(&km_waitq); +} + +int xfrm_policy_walk(int (*func)(struct xfrm_policy *, int, int, void*), + void *data) +{ + struct xfrm_policy *xp; + int dir; + int count = 0; + int error = 0; + + read_lock_bh(&xfrm_policy_lock); + for (dir = 0; dir <
2*XFRM_POLICY_MAX; dir++) { + for (xp = xfrm_policy_list[dir]; xp; xp = xp->next) + count++; + } + + if (count == 0) { + error = -ENOENT; + goto out; + } + + for (dir = 0; dir < 2*XFRM_POLICY_MAX; dir++) { + for (xp = xfrm_policy_list[dir]; xp; xp = xp->next) { + error = func(xp, dir%XFRM_POLICY_MAX, --count, data); + if (error) + goto out; + } + } + +out: + read_unlock_bh(&xfrm_policy_lock); + return error; +} + + +/* Find policy to apply to this flow. */ + +static void xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir, + void **objp, atomic_t **obj_refp) +{ + struct xfrm_policy *pol; + + read_lock_bh(&xfrm_policy_lock); + for (pol = xfrm_policy_list[dir]; pol; pol = pol->next) { + struct xfrm_selector *sel = &pol->selector; + int match; + + if (pol->family != family) + continue; + + match = xfrm_selector_match(sel, fl, family); + if (match) { + xfrm_pol_hold(pol); + break; + } + } + read_unlock_bh(&xfrm_policy_lock); + if ((*objp = (void *) pol) != NULL) + *obj_refp = &pol->refcnt; +} + +struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, struct flowi *fl) +{ + struct xfrm_policy *pol; + + read_lock_bh(&xfrm_policy_lock); + if ((pol = sk->policy[dir]) != NULL) { + int match; + + match = xfrm_selector_match(&pol->selector, fl, sk->family); + if (match) + xfrm_pol_hold(pol); + else + pol = NULL; + } + read_unlock_bh(&xfrm_policy_lock); + return pol; +} + +static void __xfrm_policy_link(struct xfrm_policy *pol, int dir) +{ + pol->next = xfrm_policy_list[dir]; + xfrm_policy_list[dir] = pol; + xfrm_pol_hold(pol); +} + +static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, + int dir) +{ + struct xfrm_policy **polp; + + for (polp = &xfrm_policy_list[dir]; + *polp != NULL; polp = &(*polp)->next) { + if (*polp == pol) { + *polp = pol->next; + return pol; + } + } + return NULL; +} + +void xfrm_policy_delete(struct xfrm_policy *pol, int dir) +{ + write_lock_bh(&xfrm_policy_lock); + pol = __xfrm_policy_unlink(pol, dir); + 
write_unlock_bh(&xfrm_policy_lock); + if (pol) + xfrm_policy_kill(pol); +} + +int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol) +{ + struct xfrm_policy *old_pol; + + write_lock_bh(&xfrm_policy_lock); + old_pol = sk->policy[dir]; + sk->policy[dir] = pol; + if (pol) { + pol->curlft.add_time = (unsigned long)xtime.tv_sec; + pol->index = xfrm_gen_index(XFRM_POLICY_MAX+dir); + __xfrm_policy_link(pol, XFRM_POLICY_MAX+dir); + } + if (old_pol) + __xfrm_policy_unlink(old_pol, XFRM_POLICY_MAX+dir); + write_unlock_bh(&xfrm_policy_lock); + + if (old_pol) { + xfrm_policy_kill(old_pol); + } + wake_up(&km_waitq); + return 0; +} + +static struct xfrm_policy *clone_policy(struct xfrm_policy *old, int dir) +{ + struct xfrm_policy *newp = xfrm_policy_alloc(GFP_ATOMIC); + + if (newp) { + newp->selector = old->selector; + newp->lft = old->lft; + newp->curlft = old->curlft; + newp->action = old->action; + newp->flags = old->flags; + newp->xfrm_nr = old->xfrm_nr; + newp->index = old->index; + memcpy(newp->xfrm_vec, old->xfrm_vec, + newp->xfrm_nr*sizeof(struct xfrm_tmpl)); + write_lock_bh(&xfrm_policy_lock); + __xfrm_policy_link(newp, XFRM_POLICY_MAX+dir); + write_unlock_bh(&xfrm_policy_lock); + xfrm_pol_put(newp); + } + return newp; +} + +int __xfrm_sk_clone_policy(struct sock *sk) +{ + struct xfrm_policy *p0, *p1; + p0 = sk->policy[0]; + p1 = sk->policy[1]; + sk->policy[0] = NULL; + sk->policy[1] = NULL; + if (p0 && (sk->policy[0] = clone_policy(p0, 0)) == NULL) + return -ENOMEM; + if (p1 && (sk->policy[1] = clone_policy(p1, 1)) == NULL) + return -ENOMEM; + return 0; +} + +/* Resolve list of templates for the flow, given policy. 
*/ + +static int +xfrm_tmpl_resolve(struct xfrm_policy *policy, struct flowi *fl, + struct xfrm_state **xfrm, + unsigned short family) +{ + int nx; + int i, error; + xfrm_address_t *daddr = xfrm_flowi_daddr(fl, family); + xfrm_address_t *saddr = xfrm_flowi_saddr(fl, family); + + for (nx=0, i = 0; i < policy->xfrm_nr; i++) { + struct xfrm_state *x; + xfrm_address_t *remote = daddr; + xfrm_address_t *local = saddr; + struct xfrm_tmpl *tmpl = &policy->xfrm_vec[i]; + + if (tmpl->mode) { + remote = &tmpl->id.daddr; + local = &tmpl->saddr; + } + + x = xfrm_state_find(remote, local, fl, tmpl, policy, &error, family); + + if (x && x->km.state == XFRM_STATE_VALID) { + xfrm[nx++] = x; + daddr = remote; + saddr = local; + continue; + } + if (x) { + error = (x->km.state == XFRM_STATE_ERROR ? + -EINVAL : -EAGAIN); + xfrm_state_put(x); + } + + if (!tmpl->optional) + goto fail; + } + return nx; + +fail: + for (nx--; nx>=0; nx--) + xfrm_state_put(xfrm[nx]); + return error; +} + +/* Check that the bundle accepts the flow and its components are + * still valid. + */ + +static struct dst_entry * +xfrm_find_bundle(struct flowi *fl, struct xfrm_policy *policy, unsigned short family) +{ + struct dst_entry *x; + struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); + if (unlikely(afinfo == NULL)) + return ERR_PTR(-EINVAL); + x = afinfo->find_bundle(fl, policy); + xfrm_policy_put_afinfo(afinfo); + return x; +} + +/* Allocate chain of dst_entry's, attach known xfrm's, calculate + * all the metrics... Shortly, bundle a bundle. 
+ */ + +static int +xfrm_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int nx, + struct flowi *fl, struct dst_entry **dst_p, + unsigned short family) +{ + int err; + struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); + if (unlikely(afinfo == NULL)) + return -EINVAL; + err = afinfo->bundle_create(policy, xfrm, nx, fl, dst_p); + xfrm_policy_put_afinfo(afinfo); + return err; +} + +static inline int policy_to_flow_dir(int dir) +{ + if (XFRM_POLICY_IN == FLOW_DIR_IN && + XFRM_POLICY_OUT == FLOW_DIR_OUT && + XFRM_POLICY_FWD == FLOW_DIR_FWD) + return dir; + switch (dir) { + default: + case XFRM_POLICY_IN: + return FLOW_DIR_IN; + case XFRM_POLICY_OUT: + return FLOW_DIR_OUT; + case XFRM_POLICY_FWD: + return FLOW_DIR_FWD; + }; +} + +static int stale_bundle(struct dst_entry *dst); + +/* Main function: finds/creates a bundle for given flow. + * + * At the moment we eat a raw IP route. Mostly to speed up lookups + * on interfaces with disabled IPsec. + */ +int xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl, + struct sock *sk, int flags) +{ + struct xfrm_policy *policy; + struct xfrm_state *xfrm[XFRM_MAX_DEPTH]; + struct rtable *rt = (struct rtable*)*dst_p; + struct dst_entry *dst; + int nx = 0; + int err; + u32 genid; + u16 family = (*dst_p)->ops->family; + + switch (family) { + case AF_INET: + if (!fl->fl4_src) + fl->fl4_src = rt->rt_src; + if (!fl->fl4_dst) + fl->fl4_dst = rt->rt_dst; + case AF_INET6: + /* Still not clear... */ + default: + /* nothing */; + } + +restart: + genid = atomic_read(&flow_cache_genid); + policy = NULL; + if (sk && sk->policy[1]) + policy = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl); + + if (!policy) { + /* To accelerate a bit... 
*/ + if ((rt->u.dst.flags & DST_NOXFRM) || !xfrm_policy_list[XFRM_POLICY_OUT]) + return 0; + + policy = flow_cache_lookup(fl, family, + policy_to_flow_dir(XFRM_POLICY_OUT), + xfrm_policy_lookup); + } + + if (!policy) + return 0; + + policy->curlft.use_time = (unsigned long)xtime.tv_sec; + + switch (policy->action) { + case XFRM_POLICY_BLOCK: + /* Prohibit the flow */ + xfrm_pol_put(policy); + return -EPERM; + + case XFRM_POLICY_ALLOW: + if (policy->xfrm_nr == 0) { + /* Flow passes not transformed. */ + xfrm_pol_put(policy); + return 0; + } + + /* Try to find matching bundle. + * + * LATER: help from flow cache. It is optional, this + * is required only for output policy. + */ + dst = xfrm_find_bundle(fl, policy, family); + if (IS_ERR(dst)) { + xfrm_pol_put(policy); + return PTR_ERR(dst); + } + + if (dst) + break; + + nx = xfrm_tmpl_resolve(policy, fl, xfrm, family); + + if (unlikely(nx<0)) { + err = nx; + if (err == -EAGAIN && flags) { + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(&km_waitq, &wait); + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + set_current_state(TASK_RUNNING); + remove_wait_queue(&km_waitq, &wait); + + nx = xfrm_tmpl_resolve(policy, fl, xfrm, family); + + if (nx == -EAGAIN && signal_pending(current)) { + err = -ERESTART; + goto error; + } + if (nx == -EAGAIN || + genid != atomic_read(&flow_cache_genid)) { + xfrm_pol_put(policy); + goto restart; + } + err = nx; + } + if (err < 0) + goto error; + } + if (nx == 0) { + /* Flow passes not transformed. */ + xfrm_pol_put(policy); + return 0; + } + + dst = &rt->u.dst; + err = xfrm_bundle_create(policy, xfrm, nx, fl, &dst, family); + + if (unlikely(err)) { + int i; + for (i=0; ilock); + if (unlikely(policy->dead || stale_bundle(dst))) { + /* Wow! While we worked on resolving, this + * policy has gone. Retry. It is not paranoia, + * we just cannot enlist new bundle to dead object. + * We can't enlist stable bundles either. 
+ */ + write_unlock_bh(&policy->lock); + + xfrm_pol_put(policy); + if (dst) + dst_free(dst); + goto restart; + } + dst->next = policy->bundles; + policy->bundles = dst; + dst_hold(dst); + write_unlock_bh(&policy->lock); + } + *dst_p = dst; + ip_rt_put(rt); + xfrm_pol_put(policy); + return 0; + +error: + ip_rt_put(rt); + xfrm_pol_put(policy); + *dst_p = NULL; + return err; +} + +/* When skb is transformed back to its "native" form, we have to + * check policy restrictions. At the moment we make this in maximally + * stupid way. Shame on me. :-) Of course, connected sockets must + * have policy cached at them. + */ + +static inline int +xfrm_state_ok(struct xfrm_tmpl *tmpl, struct xfrm_state *x, + unsigned short family) +{ + if (xfrm_state_kern(x)) + return tmpl->optional && !xfrm_state_addr_cmp(tmpl, x, family); + return x->id.proto == tmpl->id.proto && + (x->id.spi == tmpl->id.spi || !tmpl->id.spi) && + (x->props.reqid == tmpl->reqid || !tmpl->reqid) && + x->props.mode == tmpl->mode && + (tmpl->aalgos & (1<props.aalgo)) && + !(x->props.mode && xfrm_state_addr_cmp(tmpl, x, family)); +} + +static inline int +xfrm_policy_ok(struct xfrm_tmpl *tmpl, struct sec_path *sp, int start, + unsigned short family) +{ + int idx = start; + + if (tmpl->optional) { + if (!tmpl->mode) + return start; + } else + start = -1; + for (; idx < sp->len; idx++) { + if (xfrm_state_ok(tmpl, sp->x[idx].xvec, family)) + return ++idx; + if (sp->x[idx].xvec->props.mode) + break; + } + return start; +} + +static int +_decode_session(struct sk_buff *skb, struct flowi *fl, unsigned short family) +{ + struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); + + if (unlikely(afinfo == NULL)) + return -EAFNOSUPPORT; + + afinfo->decode_session(skb, fl); + xfrm_policy_put_afinfo(afinfo); + return 0; +} + +int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, + unsigned short family) +{ + struct xfrm_policy *pol; + struct flowi fl; + + if (_decode_session(skb, &fl, family) < 0) 
+ return 0; + + /* First, check used SA against their selectors. */ + if (skb->sp) { + int i; + + for (i=skb->sp->len-1; i>=0; i--) { + struct sec_decap_state *xvec = &(skb->sp->x[i]); + if (!xfrm_selector_match(&xvec->xvec->sel, &fl, family)) + return 0; + + /* If there is a post_input processor, try running it */ + if (xvec->xvec->type->post_input && + (xvec->xvec->type->post_input)(xvec->xvec, + &(xvec->decap), + skb) != 0) + return 0; + } + } + + pol = NULL; + if (sk && sk->policy[dir]) + pol = xfrm_sk_policy_lookup(sk, dir, &fl); + + if (!pol) + pol = flow_cache_lookup(&fl, family, + policy_to_flow_dir(dir), + xfrm_policy_lookup); + + if (!pol) + return !skb->sp; + + pol->curlft.use_time = (unsigned long)xtime.tv_sec; + + if (pol->action == XFRM_POLICY_ALLOW) { + struct sec_path *sp; + static struct sec_path dummy; + int i, k; + + if ((sp = skb->sp) == NULL) + sp = &dummy; + + /* For each tunnel xfrm, find the first matching tmpl. + * For each tmpl before that, find corresponding xfrm. + * Order is _important_. Later we will implement + * some barriers, but at the moment barriers + * are implied between each two transformations. + */ + for (i = pol->xfrm_nr-1, k = 0; i >= 0; i--) { + k = xfrm_policy_ok(pol->xfrm_vec+i, sp, k, family); + if (k < 0) + goto reject; + } + + for (; k < sp->len; k++) { + if (sp->x[k].xvec->props.mode) + goto reject; + } + + xfrm_pol_put(pol); + return 1; + } + +reject: + xfrm_pol_put(pol); + return 0; +} + +int __xfrm_route_forward(struct sk_buff *skb, unsigned short family) +{ + struct flowi fl; + + if (_decode_session(skb, &fl, family) < 0) + return 0; + + return xfrm_lookup(&skb->dst, &fl, NULL, 0) == 0; +} + +/* Optimize later using cookies and generation ids. 
*/ + +static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie) +{ + if (!stale_bundle(dst)) + return dst; + + dst_release(dst); + return NULL; +} + +static int stale_bundle(struct dst_entry *dst) +{ + struct dst_entry *child = dst; + + while (child) { + if (child->obsolete > 0 || + (child->dev && !netif_running(child->dev)) || + (child->xfrm && child->xfrm->km.state != XFRM_STATE_VALID)) { + return 1; + } + child = child->child; + } + + return 0; +} + +static void xfrm_dst_destroy(struct dst_entry *dst) +{ + xfrm_state_put(dst->xfrm); + dst->xfrm = NULL; +} + +static void xfrm_link_failure(struct sk_buff *skb) +{ + /* Impossible. Such dst must be popped before reaches point of failure. */ + return; +} + +static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst) +{ + if (dst) { + if (dst->obsolete) { + dst_release(dst); + dst = NULL; + } + } + return dst; +} + +static void xfrm_prune_bundles(int (*func)(struct dst_entry *)) +{ + int i; + struct xfrm_policy *pol; + struct dst_entry *dst, **dstp, *gc_list = NULL; + + read_lock_bh(&xfrm_policy_lock); + for (i=0; i<2*XFRM_POLICY_MAX; i++) { + for (pol = xfrm_policy_list[i]; pol; pol = pol->next) { + write_lock(&pol->lock); + dstp = &pol->bundles; + while ((dst=*dstp) != NULL) { + if (func(dst)) { + *dstp = dst->next; + dst->next = gc_list; + gc_list = dst; + } else { + dstp = &dst->next; + } + } + write_unlock(&pol->lock); + } + } + read_unlock_bh(&xfrm_policy_lock); + + while (gc_list) { + dst = gc_list; + gc_list = dst->next; + dst_free(dst); + } +} + +static int unused_bundle(struct dst_entry *dst) +{ + return !atomic_read(&dst->__refcnt); +} + +static void __xfrm_garbage_collect(void) +{ + xfrm_prune_bundles(unused_bundle); +} + +int xfrm_flush_bundles(void) +{ + xfrm_prune_bundles(stale_bundle); + return 0; +} + +/* Well... that's _TASK_. We need to scan through transformation + * list and figure out what mss tcp should generate in order to + * final datagram fit to mtu. Mama mia... 
:-) + * + * Apparently, some easy way exists, but we used to choose the most + * bizarre ones. :-) So, raising Kalashnikov... tra-ta-ta. + * + * Consider this function as something like dark humour. :-) + */ +static int xfrm_get_mss(struct dst_entry *dst, u32 mtu) +{ + int res = mtu - dst->header_len; + + for (;;) { + struct dst_entry *d = dst; + int m = res; + + do { + struct xfrm_state *x = d->xfrm; + if (x) { + spin_lock_bh(&x->lock); + if (x->km.state == XFRM_STATE_VALID && + x->type && x->type->get_max_size) + m = x->type->get_max_size(d->xfrm, m); + else + m += x->props.header_len; + spin_unlock_bh(&x->lock); + } + } while ((d = d->child) != NULL); + + if (m <= mtu) + break; + res -= (m - mtu); + if (res < 88) + return mtu; + } + + return res + dst->header_len; +} + +int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo) +{ + int err = 0; + if (unlikely(afinfo == NULL)) + return -EINVAL; + if (unlikely(afinfo->family >= NPROTO)) + return -EAFNOSUPPORT; + write_lock(&xfrm_policy_afinfo_lock); + if (unlikely(xfrm_policy_afinfo[afinfo->family] != NULL)) + err = -ENOBUFS; + else { + struct dst_ops *dst_ops = afinfo->dst_ops; + if (likely(dst_ops->kmem_cachep == NULL)) + dst_ops->kmem_cachep = xfrm_dst_cache; + if (likely(dst_ops->check == NULL)) + dst_ops->check = xfrm_dst_check; + if (likely(dst_ops->destroy == NULL)) + dst_ops->destroy = xfrm_dst_destroy; + if (likely(dst_ops->negative_advice == NULL)) + dst_ops->negative_advice = xfrm_negative_advice; + if (likely(dst_ops->link_failure == NULL)) + dst_ops->link_failure = xfrm_link_failure; + if (likely(dst_ops->get_mss == NULL)) + dst_ops->get_mss = xfrm_get_mss; + if (likely(afinfo->garbage_collect == NULL)) + afinfo->garbage_collect = __xfrm_garbage_collect; + xfrm_policy_afinfo[afinfo->family] = afinfo; + } + write_unlock(&xfrm_policy_afinfo_lock); + return err; +} + +int xfrm_policy_unregister_afinfo(struct xfrm_policy_afinfo *afinfo) +{ + int err = 0; + if (unlikely(afinfo == NULL)) + return 
-EINVAL; + if (unlikely(afinfo->family >= NPROTO)) + return -EAFNOSUPPORT; + write_lock(&xfrm_policy_afinfo_lock); + if (likely(xfrm_policy_afinfo[afinfo->family] != NULL)) { + if (unlikely(xfrm_policy_afinfo[afinfo->family] != afinfo)) + err = -EINVAL; + else { + struct dst_ops *dst_ops = afinfo->dst_ops; + xfrm_policy_afinfo[afinfo->family] = NULL; + dst_ops->kmem_cachep = NULL; + dst_ops->check = NULL; + dst_ops->destroy = NULL; + dst_ops->negative_advice = NULL; + dst_ops->link_failure = NULL; + dst_ops->get_mss = NULL; + afinfo->garbage_collect = NULL; + } + } + write_unlock(&xfrm_policy_afinfo_lock); + return err; +} + +struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family) +{ + struct xfrm_policy_afinfo *afinfo; + if (unlikely(family >= NPROTO)) + return NULL; + read_lock(&xfrm_policy_afinfo_lock); + afinfo = xfrm_policy_afinfo[family]; + if (likely(afinfo != NULL)) + read_lock(&afinfo->lock); + read_unlock(&xfrm_policy_afinfo_lock); + return afinfo; +} + +void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo) +{ + if (unlikely(afinfo == NULL)) + return; + read_unlock(&afinfo->lock); +} + +static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + switch (event) { + case NETDEV_DOWN: + xfrm_flush_bundles(); + } + return NOTIFY_DONE; +} + +struct notifier_block xfrm_dev_notifier = { + xfrm_dev_event, + NULL, + 0 +}; + +void __init xfrm_policy_init(void) +{ + xfrm_dst_cache = kmem_cache_create("xfrm_dst_cache", + sizeof(struct xfrm_dst), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (!xfrm_dst_cache) + panic("XFRM: failed to allocate xfrm_dst_cache\n"); + + INIT_TQUEUE(&xfrm_policy_gc_work, xfrm_policy_gc_task, NULL); + register_netdevice_notifier(&xfrm_dev_notifier); +} + +void __init xfrm_init(void) +{ + xfrm_state_init(); + xfrm_policy_init(); + xfrm_input_init(); +} + Index: net/xfrm/xfrm_state.c =================================================================== RCS file: net/xfrm/xfrm_state.c diff 
-N net/xfrm/xfrm_state.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ b/net/xfrm/xfrm_state.c 16 Apr 2004 13:16:27 -0000 1.8.8.1 @@ -0,0 +1,942 @@ +/* + * xfrm_state.c + * + * Changes: + * Mitsuru KANDA @USAGI + * Kazunori MIYAZAWA @USAGI + * Kunihiro Ishiguro + * IPv6 support + * YOSHIFUJI Hideaki @USAGI + * Split up af-specific functions + * Derek Atkins + * Add UDP Encapsulation + * + */ + +#include +#include +#include +#include +#include + +/* Each xfrm_state may be linked to two tables: + + 1. Hash table by (spi,daddr,ah/esp) to find SA by SPI. (input,ctl) + 2. Hash table by daddr to find what SAs exist for given + destination/tunnel endpoint. (output) + */ + +static spinlock_t xfrm_state_lock = SPIN_LOCK_UNLOCKED; + +/* Hash table to find appropriate SA towards given target (endpoint + * of tunnel or destination of transport mode) allowed by selector. + * + * Main use is finding SA after policy selected tunnel or transport mode. + * Also, it can be used by ah/esp icmp error handler to find offending SA. 
+ */ +static struct list_head xfrm_state_bydst[XFRM_DST_HSIZE]; +static struct list_head xfrm_state_byspi[XFRM_DST_HSIZE]; + +DECLARE_WAIT_QUEUE_HEAD(km_waitq); + +static rwlock_t xfrm_state_afinfo_lock = RW_LOCK_UNLOCKED; +static struct xfrm_state_afinfo *xfrm_state_afinfo[NPROTO]; + +static struct tq_struct xfrm_state_gc_work; +static struct list_head xfrm_state_gc_list = LIST_HEAD_INIT(xfrm_state_gc_list); +static spinlock_t xfrm_state_gc_lock = SPIN_LOCK_UNLOCKED; + +static void __xfrm_state_delete(struct xfrm_state *x); + +static void xfrm_state_gc_destroy(struct xfrm_state *x) +{ + if (del_timer(&x->timer)) + BUG(); + if (x->aalg) + kfree(x->aalg); + if (x->ealg) + kfree(x->ealg); + if (x->calg) + kfree(x->calg); + if (x->encap) + kfree(x->encap); + if (x->type) { + x->type->destructor(x); + xfrm_put_type(x->type); + } + kfree(x); + wake_up(&km_waitq); +} + +static void xfrm_state_gc_task(void *data) +{ + struct xfrm_state *x; + struct list_head *entry, *tmp; + struct list_head gc_list = LIST_HEAD_INIT(gc_list); + + spin_lock_bh(&xfrm_state_gc_lock); + list_splice_init(&xfrm_state_gc_list, &gc_list); + spin_unlock_bh(&xfrm_state_gc_lock); + + list_for_each_safe(entry, tmp, &gc_list) { + x = list_entry(entry, struct xfrm_state, bydst); + xfrm_state_gc_destroy(x); + } +} + +static inline unsigned long make_jiffies(long secs) +{ + if (secs >= (MAX_SCHEDULE_TIMEOUT-1)/HZ) + return MAX_SCHEDULE_TIMEOUT-1; + else + return secs*HZ; +} + +static void xfrm_timer_handler(unsigned long data) +{ + struct xfrm_state *x = (struct xfrm_state*)data; + unsigned long now = (unsigned long)xtime.tv_sec; + long next = LONG_MAX; + int warn = 0; + + spin_lock(&x->lock); + if (x->km.state == XFRM_STATE_DEAD) + goto out; + if (x->km.state == XFRM_STATE_EXPIRED) + goto expired; + if (x->lft.hard_add_expires_seconds) { + long tmo = x->lft.hard_add_expires_seconds + + x->curlft.add_time - now; + if (tmo <= 0) + goto expired; + if (tmo < next) + next = tmo; + } + if 
(x->lft.hard_use_expires_seconds) { + long tmo = x->lft.hard_use_expires_seconds + + (x->curlft.use_time ? : now) - now; + if (tmo <= 0) + goto expired; + if (tmo < next) + next = tmo; + } + if (x->km.dying) + goto resched; + if (x->lft.soft_add_expires_seconds) { + long tmo = x->lft.soft_add_expires_seconds + + x->curlft.add_time - now; + if (tmo <= 0) + warn = 1; + else if (tmo < next) + next = tmo; + } + if (x->lft.soft_use_expires_seconds) { + long tmo = x->lft.soft_use_expires_seconds + + (x->curlft.use_time ? : now) - now; + if (tmo <= 0) + warn = 1; + else if (tmo < next) + next = tmo; + } + + if (warn) + km_state_expired(x, 0); +resched: + if (next != LONG_MAX && + !mod_timer(&x->timer, jiffies + make_jiffies(next))) + xfrm_state_hold(x); + goto out; + +expired: + if (x->km.state == XFRM_STATE_ACQ && x->id.spi == 0) { + x->km.state = XFRM_STATE_EXPIRED; + wake_up(&km_waitq); + next = 2; + goto resched; + } + if (x->id.spi != 0) + km_state_expired(x, 1); + __xfrm_state_delete(x); + +out: + spin_unlock(&x->lock); + xfrm_state_put(x); +} + +struct xfrm_state *xfrm_state_alloc(void) +{ + struct xfrm_state *x; + + x = kmalloc(sizeof(struct xfrm_state), GFP_ATOMIC); + + if (x) { + memset(x, 0, sizeof(struct xfrm_state)); + atomic_set(&x->refcnt, 1); + atomic_set(&x->tunnel_users, 0); + INIT_LIST_HEAD(&x->bydst); + INIT_LIST_HEAD(&x->byspi); + init_timer(&x->timer); + x->timer.function = xfrm_timer_handler; + x->timer.data = (unsigned long)x; + x->curlft.add_time = (unsigned long)xtime.tv_sec; + x->lft.soft_byte_limit = XFRM_INF; + x->lft.soft_packet_limit = XFRM_INF; + x->lft.hard_byte_limit = XFRM_INF; + x->lft.hard_packet_limit = XFRM_INF; + x->lock = SPIN_LOCK_UNLOCKED; + } + return x; +} + +void __xfrm_state_destroy(struct xfrm_state *x) +{ + BUG_TRAP(x->km.state == XFRM_STATE_DEAD); + + spin_lock_bh(&xfrm_state_gc_lock); + list_add(&x->bydst, &xfrm_state_gc_list); + spin_unlock_bh(&xfrm_state_gc_lock); + schedule_task(&xfrm_state_gc_work); +} + +static void 
__xfrm_state_delete(struct xfrm_state *x) +{ + if (x->km.state != XFRM_STATE_DEAD) { + x->km.state = XFRM_STATE_DEAD; + spin_lock(&xfrm_state_lock); + list_del(&x->bydst); + atomic_dec(&x->refcnt); + if (x->id.spi) { + list_del(&x->byspi); + atomic_dec(&x->refcnt); + } + spin_unlock(&xfrm_state_lock); + if (del_timer(&x->timer)) + atomic_dec(&x->refcnt); + + /* The number two in this test is the reference + * mentioned in the comment below plus the reference + * our caller holds. A larger value means that + * there are DSTs attached to this xfrm_state. + */ + if (atomic_read(&x->refcnt) > 2) + xfrm_flush_bundles(); + + /* All xfrm_state objects are created by one of two possible + * paths: + * + * 1) xfrm_state_alloc --> xfrm_state_insert + * 2) xfrm_state_lookup --> xfrm_state_insert + * + * The xfrm_state_lookup or xfrm_state_alloc call gives a + * reference, and that is what we are dropping here. + */ + atomic_dec(&x->refcnt); + } +} + +void xfrm_state_delete(struct xfrm_state *x) +{ + xfrm_state_delete_tunnel(x); + spin_lock_bh(&x->lock); + __xfrm_state_delete(x); + spin_unlock_bh(&x->lock); +} + +void xfrm_state_flush(u8 proto) +{ + int i; + struct xfrm_state *x; + + spin_lock_bh(&xfrm_state_lock); + for (i = 0; i < XFRM_DST_HSIZE; i++) { +restart: + list_for_each_entry(x, xfrm_state_bydst+i, bydst) { + if (!xfrm_state_kern(x) && + (proto == IPSEC_PROTO_ANY || x->id.proto == proto)) { + xfrm_state_hold(x); + spin_unlock_bh(&xfrm_state_lock); + + xfrm_state_delete(x); + xfrm_state_put(x); + + spin_lock_bh(&xfrm_state_lock); + goto restart; /* the lock was dropped, so rescan this bucket from its head */ + } + } + } + spin_unlock_bh(&xfrm_state_lock); + wake_up(&km_waitq); +} + +static int +xfrm_init_tempsel(struct xfrm_state *x, struct flowi *fl, + struct xfrm_tmpl *tmpl, + xfrm_address_t *daddr, xfrm_address_t *saddr, + unsigned short family) +{ + struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); + if (!afinfo) + return -1; /* no afinfo registered for this family */ + afinfo->init_tempsel(x, fl, tmpl, daddr, saddr); + xfrm_state_put_afinfo(afinfo); + return 0; +} + +struct xfrm_state *
+xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr, + struct flowi *fl, struct xfrm_tmpl *tmpl, + struct xfrm_policy *pol, int *err, + unsigned short family) +{ + unsigned h = xfrm_dst_hash(daddr, family); + struct xfrm_state *x; + int acquire_in_progress = 0; + int error = 0; + struct xfrm_state *best = NULL; + + spin_lock_bh(&xfrm_state_lock); + list_for_each_entry(x, xfrm_state_bydst+h, bydst) { + if (x->props.family == family && + x->props.reqid == tmpl->reqid && + xfrm_state_addr_check(x, daddr, saddr, family) && + tmpl->mode == x->props.mode && + tmpl->id.proto == x->id.proto) { + /* Resolution logic: + 1. There is a valid state with matching selector. + Done. + 2. Valid state with inappropriate selector. Skip. + + Entering area of "sysdeps". + + 3. If state is not valid, selector is temporary, + it selects only session which triggered + previous resolution. Key manager will do + something to install a state with proper + selector. + */ + if (x->km.state == XFRM_STATE_VALID) { + if (!xfrm_selector_match(&x->sel, fl, family)) + continue; + if (!best || + best->km.dying > x->km.dying || + (best->km.dying == x->km.dying && + best->curlft.add_time < x->curlft.add_time)) + best = x; + } else if (x->km.state == XFRM_STATE_ACQ) { + acquire_in_progress = 1; + } else if (x->km.state == XFRM_STATE_ERROR || + x->km.state == XFRM_STATE_EXPIRED) { + if (xfrm_selector_match(&x->sel, fl, family)) + error = 1; + } + } + } + + if (best) { + xfrm_state_hold(best); + spin_unlock_bh(&xfrm_state_lock); + return best; + } + + x = NULL; + if (!error && !acquire_in_progress && + ((x = xfrm_state_alloc()) != NULL)) { + /* Initialize temporary selector matching only + * to current session. 
*/ + xfrm_init_tempsel(x, fl, tmpl, daddr, saddr, family); + + if (km_query(x, tmpl, pol) == 0) { + x->km.state = XFRM_STATE_ACQ; + list_add_tail(&x->bydst, xfrm_state_bydst+h); + xfrm_state_hold(x); + if (x->id.spi) { + h = xfrm_spi_hash(&x->id.daddr, x->id.spi, x->id.proto, family); + list_add(&x->byspi, xfrm_state_byspi+h); + xfrm_state_hold(x); + } + x->lft.hard_add_expires_seconds = XFRM_ACQ_EXPIRES; + xfrm_state_hold(x); + mod_timer(&x->timer, XFRM_ACQ_EXPIRES*HZ); + } else { + x->km.state = XFRM_STATE_DEAD; + xfrm_state_put(x); + x = NULL; + error = 1; + } + } + spin_unlock_bh(&xfrm_state_lock); + if (!x) + *err = acquire_in_progress ? -EAGAIN : + (error ? -ESRCH : -ENOMEM); + return x; +} + +static void __xfrm_state_insert(struct xfrm_state *x) +{ + unsigned h = xfrm_dst_hash(&x->id.daddr, x->props.family); + + list_add(&x->bydst, xfrm_state_bydst+h); + xfrm_state_hold(x); + + h = xfrm_spi_hash(&x->id.daddr, x->id.spi, x->id.proto, x->props.family); + + list_add(&x->byspi, xfrm_state_byspi+h); + xfrm_state_hold(x); + + if (!mod_timer(&x->timer, jiffies + HZ)) + xfrm_state_hold(x); + + wake_up(&km_waitq); +} + +void xfrm_state_insert(struct xfrm_state *x) +{ + spin_lock_bh(&xfrm_state_lock); + __xfrm_state_insert(x); + spin_unlock_bh(&xfrm_state_lock); +} + +int xfrm_state_add(struct xfrm_state *x) +{ + struct xfrm_state_afinfo *afinfo; + struct xfrm_state *x1; + int err; + + afinfo = xfrm_state_get_afinfo(x->props.family); + if (unlikely(afinfo == NULL)) + return -EAFNOSUPPORT; + + spin_lock_bh(&xfrm_state_lock); + + x1 = afinfo->state_lookup(&x->id.daddr, x->id.spi, x->id.proto); + if (!x1) { + x1 = afinfo->find_acq( + x->props.mode, x->props.reqid, x->id.proto, + &x->id.daddr, &x->props.saddr, 0); + if (x1 && x1->id.spi != x->id.spi && x1->id.spi) { + xfrm_state_put(x1); + x1 = NULL; + } + } + + if (x1 && x1->id.spi) { + xfrm_state_put(x1); + x1 = NULL; + err = -EEXIST; + goto out; + } + + __xfrm_state_insert(x); + err = 0; + +out: + 
spin_unlock_bh(&xfrm_state_lock); + xfrm_state_put_afinfo(afinfo); + + if (x1) { + xfrm_state_delete(x1); + xfrm_state_put(x1); + } + + return err; +} + +int xfrm_state_update(struct xfrm_state *x) +{ + struct xfrm_state_afinfo *afinfo; + struct xfrm_state *x1; + int err; + + afinfo = xfrm_state_get_afinfo(x->props.family); + if (unlikely(afinfo == NULL)) + return -EAFNOSUPPORT; + + spin_lock_bh(&xfrm_state_lock); + x1 = afinfo->state_lookup(&x->id.daddr, x->id.spi, x->id.proto); + + err = -ESRCH; + if (!x1) + goto out; + + if (xfrm_state_kern(x1)) { + xfrm_state_put(x1); + err = -EEXIST; + goto out; + } + + if (x1->km.state == XFRM_STATE_ACQ) { + __xfrm_state_insert(x); + x = NULL; + } + err = 0; + +out: + spin_unlock_bh(&xfrm_state_lock); + xfrm_state_put_afinfo(afinfo); + + if (err) + return err; + + if (!x) { + xfrm_state_delete(x1); + xfrm_state_put(x1); + return 0; + } + + err = -EINVAL; + spin_lock_bh(&x1->lock); + if (likely(x1->km.state == XFRM_STATE_VALID)) { + if (x->encap && x1->encap) + memcpy(x1->encap, x->encap, sizeof(*x1->encap)); + memcpy(&x1->lft, &x->lft, sizeof(x1->lft)); + x1->km.dying = 0; + err = 0; + } + spin_unlock_bh(&x1->lock); + + if (!mod_timer(&x1->timer, jiffies + HZ)) + xfrm_state_hold(x1); + if (x1->curlft.use_time) + xfrm_state_check_expire(x1); + + xfrm_state_put(x1); + + return err; +} + +int xfrm_state_check_expire(struct xfrm_state *x) +{ + if (!x->curlft.use_time) + x->curlft.use_time = (unsigned long)xtime.tv_sec; + + if (x->km.state != XFRM_STATE_VALID) + return -EINVAL; + + if (x->curlft.bytes >= x->lft.hard_byte_limit || + x->curlft.packets >= x->lft.hard_packet_limit) { + km_state_expired(x, 1); + if (!mod_timer(&x->timer, jiffies + XFRM_ACQ_EXPIRES*HZ)) + xfrm_state_hold(x); + return -EINVAL; + } + + if (!x->km.dying && + (x->curlft.bytes >= x->lft.soft_byte_limit || + x->curlft.packets >= x->lft.soft_packet_limit)) + km_state_expired(x, 0); + return 0; +} + +int xfrm_state_check_space(struct xfrm_state *x, struct 
sk_buff *skb) +{ + int nhead = x->props.header_len + LL_RESERVED_SPACE(skb->dst->dev) + - skb_headroom(skb); + + if (nhead > 0) + return pskb_expand_head(skb, nhead, 0, GFP_ATOMIC); + + /* Check tail too... */ + return 0; +} + +struct xfrm_state * +xfrm_state_lookup(xfrm_address_t *daddr, u32 spi, u8 proto, + unsigned short family) +{ + struct xfrm_state *x; + struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); + if (!afinfo) + return NULL; + + spin_lock_bh(&xfrm_state_lock); + x = afinfo->state_lookup(daddr, spi, proto); + spin_unlock_bh(&xfrm_state_lock); + xfrm_state_put_afinfo(afinfo); + return x; +} + +struct xfrm_state * +xfrm_find_acq(u8 mode, u32 reqid, u8 proto, + xfrm_address_t *daddr, xfrm_address_t *saddr, + int create, unsigned short family) +{ + struct xfrm_state *x; + struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); + if (!afinfo) + return NULL; + + spin_lock_bh(&xfrm_state_lock); + x = afinfo->find_acq(mode, reqid, proto, daddr, saddr, create); + spin_unlock_bh(&xfrm_state_lock); + xfrm_state_put_afinfo(afinfo); + return x; +} + +/* Silly enough, but I'm lazy to build resolution list */ + +struct xfrm_state * xfrm_find_acq_byseq(u32 seq) +{ + int i; + struct xfrm_state *x; + + spin_lock_bh(&xfrm_state_lock); + for (i = 0; i < XFRM_DST_HSIZE; i++) { + list_for_each_entry(x, xfrm_state_bydst+i, bydst) { + if (x->km.seq == seq) { + xfrm_state_hold(x); + spin_unlock_bh(&xfrm_state_lock); + return x; + } + } + } + spin_unlock_bh(&xfrm_state_lock); + return NULL; +} + +u32 xfrm_get_acqseq(void) +{ + u32 res; + static u32 acqseq; + static spinlock_t acqseq_lock = SPIN_LOCK_UNLOCKED; + + spin_lock_bh(&acqseq_lock); + res = (++acqseq ? 
: ++acqseq); + spin_unlock_bh(&acqseq_lock); + return res; +} + +void +xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi) +{ + u32 h; + struct xfrm_state *x0; + + if (x->id.spi) + return; /* an SPI is already assigned */ + + if (minspi == maxspi) { + x0 = xfrm_state_lookup(&x->id.daddr, minspi, x->id.proto, x->props.family); + if (x0) { + xfrm_state_put(x0); + return; /* the only candidate SPI is already in use */ + } + x->id.spi = minspi; + } else { + u32 spi = 0; + minspi = ntohl(minspi); + maxspi = ntohl(maxspi); + for (h=0; h < maxspi-minspi+1; h++) { /* NOTE(review): loop head and SPI pick were reconstructed after text loss; confirm against the upstream xfrm_alloc_spi() */ + spi = minspi + net_random()%(maxspi-minspi+1); + x0 = xfrm_state_lookup(&x->id.daddr, htonl(spi), x->id.proto, x->props.family); + if (x0 == NULL) + break; + xfrm_state_put(x0); + } + x->id.spi = htonl(spi); + } + if (x->id.spi) { + spin_lock_bh(&xfrm_state_lock); + h = xfrm_spi_hash(&x->id.daddr, x->id.spi, x->id.proto, x->props.family); + list_add(&x->byspi, xfrm_state_byspi+h); + xfrm_state_hold(x); /* reference for the byspi list linkage */ + spin_unlock_bh(&xfrm_state_lock); + wake_up(&km_waitq); + } +} + +int xfrm_state_walk(u8 proto, int (*func)(struct xfrm_state *, int, void*), + void *data) +{ + int i; + struct xfrm_state *x; + int count = 0; + int err = 0; + + spin_lock_bh(&xfrm_state_lock); + for (i = 0; i < XFRM_DST_HSIZE; i++) { + list_for_each_entry(x, xfrm_state_bydst+i, bydst) { + if (proto == IPSEC_PROTO_ANY || x->id.proto == proto) + count++; + } + } + if (count == 0) { + err = -ENOENT; + goto out; + } + + for (i = 0; i < XFRM_DST_HSIZE; i++) { + list_for_each_entry(x, xfrm_state_bydst+i, bydst) { + if (proto != IPSEC_PROTO_ANY && x->id.proto != proto) + continue; + err = func(x, --count, data); /* callback receives the number of matching entries still to come */ + if (err) + goto out; + } + } +out: + spin_unlock_bh(&xfrm_state_lock); + return err; +} + + +int xfrm_replay_check(struct xfrm_state *x, u32 seq) +{ + u32 diff; + + seq = ntohl(seq); + + if (unlikely(seq == 0)) + return -EINVAL; + + if (likely(seq > x->replay.seq)) + return 0; + + diff = x->replay.seq - seq; + if (diff >= x->props.replay_window) { + x->stats.replay_window++; + return -EINVAL; + } + + if (x->replay.bitmap & (1U << diff)) { + x->stats.replay++; + return -EINVAL; + } + return 0; +} + +void
xfrm_replay_advance(struct xfrm_state *x, u32 seq) +{ + u32 diff; + + seq = ntohl(seq); + + if (seq > x->replay.seq) { + diff = seq - x->replay.seq; + if (diff < x->props.replay_window) + x->replay.bitmap = ((x->replay.bitmap) << diff) | 1; + else + x->replay.bitmap = 1; + x->replay.seq = seq; + } else { + diff = x->replay.seq - seq; + x->replay.bitmap |= (1U << diff); + } +} + +int xfrm_check_selectors(struct xfrm_state **x, int n, struct flowi *fl) +{ + int i; + + for (i=0; isel, fl, x[i]->props.family); + if (!match) + return -EINVAL; + } + return 0; +} + +static struct list_head xfrm_km_list = LIST_HEAD_INIT(xfrm_km_list); +static rwlock_t xfrm_km_lock = RW_LOCK_UNLOCKED; + +void km_state_expired(struct xfrm_state *x, int hard) +{ + struct xfrm_mgr *km; + + if (hard) + x->km.state = XFRM_STATE_EXPIRED; + else + x->km.dying = 1; + + read_lock(&xfrm_km_lock); + list_for_each_entry(km, &xfrm_km_list, list) + km->notify(x, hard); + read_unlock(&xfrm_km_lock); + + if (hard) + wake_up(&km_waitq); +} + +int km_query(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *pol) +{ + int err = -EINVAL; + struct xfrm_mgr *km; + + read_lock(&xfrm_km_lock); + list_for_each_entry(km, &xfrm_km_list, list) { + err = km->acquire(x, t, pol, XFRM_POLICY_OUT); + if (!err) + break; + } + read_unlock(&xfrm_km_lock); + return err; +} + +int km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, u16 sport) +{ + int err = -EINVAL; + struct xfrm_mgr *km; + + read_lock(&xfrm_km_lock); + list_for_each_entry(km, &xfrm_km_list, list) { + if (km->new_mapping) + err = km->new_mapping(x, ipaddr, sport); + if (!err) + break; + } + read_unlock(&xfrm_km_lock); + return err; +} + +void km_policy_expired(struct xfrm_policy *pol, int dir, int hard) +{ + struct xfrm_mgr *km; + + read_lock(&xfrm_km_lock); + list_for_each_entry(km, &xfrm_km_list, list) + if (km->notify_policy) + km->notify_policy(pol, dir, hard); + read_unlock(&xfrm_km_lock); + + if (hard) + wake_up(&km_waitq); +} + +int 
xfrm_user_policy(struct sock *sk, int optname, u8 *optval, int optlen) +{ + int err; + u8 *data; + struct xfrm_mgr *km; + struct xfrm_policy *pol = NULL; + + if (optlen <= 0 || optlen > PAGE_SIZE) + return -EMSGSIZE; + + data = kmalloc(optlen, GFP_KERNEL); + if (!data) + return -ENOMEM; + + err = -EFAULT; + if (copy_from_user(data, optval, optlen)) + goto out; + + err = -EINVAL; + read_lock(&xfrm_km_lock); + list_for_each_entry(km, &xfrm_km_list, list) { + pol = km->compile_policy(sk->family, optname, data, optlen, &err); + if (err >= 0) + break; + } + read_unlock(&xfrm_km_lock); + + if (err >= 0) { + xfrm_sk_policy_insert(sk, err, pol); + xfrm_pol_put(pol); + err = 0; + } + +out: + kfree(data); + return err; +} + +int xfrm_register_km(struct xfrm_mgr *km) +{ + write_lock_bh(&xfrm_km_lock); + list_add_tail(&km->list, &xfrm_km_list); + write_unlock_bh(&xfrm_km_lock); + return 0; +} + +int xfrm_unregister_km(struct xfrm_mgr *km) +{ + write_lock_bh(&xfrm_km_lock); + list_del(&km->list); + write_unlock_bh(&xfrm_km_lock); + return 0; +} + +int xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo) +{ + int err = 0; + if (unlikely(afinfo == NULL)) + return -EINVAL; + if (unlikely(afinfo->family >= NPROTO)) + return -EAFNOSUPPORT; + write_lock(&xfrm_state_afinfo_lock); + if (unlikely(xfrm_state_afinfo[afinfo->family] != NULL)) + err = -ENOBUFS; + else { + afinfo->state_bydst = xfrm_state_bydst; + afinfo->state_byspi = xfrm_state_byspi; + xfrm_state_afinfo[afinfo->family] = afinfo; + } + write_unlock(&xfrm_state_afinfo_lock); + return err; +} + +int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo) +{ + int err = 0; + if (unlikely(afinfo == NULL)) + return -EINVAL; + if (unlikely(afinfo->family >= NPROTO)) + return -EAFNOSUPPORT; + write_lock(&xfrm_state_afinfo_lock); + if (likely(xfrm_state_afinfo[afinfo->family] != NULL)) { + if (unlikely(xfrm_state_afinfo[afinfo->family] != afinfo)) + err = -EINVAL; + else { + xfrm_state_afinfo[afinfo->family] = NULL; 
+ afinfo->state_byspi = NULL; + afinfo->state_bydst = NULL; + } + } + write_unlock(&xfrm_state_afinfo_lock); + return err; +} + +struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned short family) +{ + struct xfrm_state_afinfo *afinfo; + if (unlikely(family >= NPROTO)) + return NULL; + read_lock(&xfrm_state_afinfo_lock); + afinfo = xfrm_state_afinfo[family]; + if (likely(afinfo != NULL)) + read_lock(&afinfo->lock); + read_unlock(&xfrm_state_afinfo_lock); + return afinfo; +} + +void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo) +{ + if (unlikely(afinfo == NULL)) + return; + read_unlock(&afinfo->lock); +} + +/* Temporarily located here until net/xfrm/xfrm_tunnel.c is created */ +void xfrm_state_delete_tunnel(struct xfrm_state *x) +{ + if (x->tunnel) { + struct xfrm_state *t = x->tunnel; + + if (atomic_read(&t->tunnel_users) == 2) + xfrm_state_delete(t); + atomic_dec(&t->tunnel_users); + xfrm_state_put(t); + x->tunnel = NULL; + } +} + +void __init xfrm_state_init(void) +{ + int i; + + for (i=0; i + * IPv6 support + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct sock *xfrm_nl; + +static int verify_one_alg(struct rtattr **xfrma, enum xfrm_attr_type_t type) +{ + struct rtattr *rt = xfrma[type - 1]; + struct xfrm_algo *algp; + + if (!rt) + return 0; + + if ((rt->rta_len - sizeof(*rt)) < sizeof(*algp)) + return -EINVAL; + + algp = RTA_DATA(rt); + switch (type) { + case XFRMA_ALG_AUTH: + if (!algp->alg_key_len && + strcmp(algp->alg_name, "digest_null") != 0) + return -EINVAL; + break; + + case XFRMA_ALG_CRYPT: + if (!algp->alg_key_len && + strcmp(algp->alg_name, "cipher_null") != 0) + return -EINVAL; + break; + + case XFRMA_ALG_COMP: + /* Zero length keys are legal. 
*/ + break; + + default: + return -EINVAL; + }; + + algp->alg_name[CRYPTO_MAX_ALG_NAME - 1] = '\0'; + return 0; +} + +static int verify_encap_tmpl(struct rtattr **xfrma) +{ + struct rtattr *rt = xfrma[XFRMA_ENCAP - 1]; + struct xfrm_encap_tmpl *encap; + + if (!rt) + return 0; + + if ((rt->rta_len - sizeof(*rt)) < sizeof(*encap)) + return -EINVAL; + + return 0; +} + +static int verify_newsa_info(struct xfrm_usersa_info *p, + struct rtattr **xfrma) +{ + int err; + + err = -EINVAL; + switch (p->family) { + case AF_INET: + break; + + case AF_INET6: +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + break; +#else + err = -EAFNOSUPPORT; + goto out; +#endif + + default: + goto out; + }; + + err = -EINVAL; + switch (p->id.proto) { + case IPPROTO_AH: + if (!xfrma[XFRMA_ALG_AUTH-1] || + xfrma[XFRMA_ALG_CRYPT-1] || + xfrma[XFRMA_ALG_COMP-1]) + goto out; + break; + + case IPPROTO_ESP: + if ((!xfrma[XFRMA_ALG_AUTH-1] && + !xfrma[XFRMA_ALG_CRYPT-1]) || + xfrma[XFRMA_ALG_COMP-1]) + goto out; + break; + + case IPPROTO_COMP: + if (!xfrma[XFRMA_ALG_COMP-1] || + xfrma[XFRMA_ALG_AUTH-1] || + xfrma[XFRMA_ALG_CRYPT-1]) + goto out; + break; + + default: + goto out; + }; + + if ((err = verify_one_alg(xfrma, XFRMA_ALG_AUTH))) + goto out; + if ((err = verify_one_alg(xfrma, XFRMA_ALG_CRYPT))) + goto out; + if ((err = verify_one_alg(xfrma, XFRMA_ALG_COMP))) + goto out; + if ((err = verify_encap_tmpl(xfrma))) + goto out; + + err = -EINVAL; + switch (p->mode) { + case 0: + case 1: + break; + + default: + goto out; + }; + + err = 0; + +out: + return err; +} + +static int attach_one_algo(struct xfrm_algo **algpp, struct rtattr *u_arg) +{ + struct rtattr *rta = u_arg; + struct xfrm_algo *p, *ualg; + + if (!rta) + return 0; + + ualg = RTA_DATA(rta); + p = kmalloc(sizeof(*ualg) + ualg->alg_key_len, GFP_KERNEL); + if (!p) + return -ENOMEM; + + memcpy(p, ualg, sizeof(*ualg) + ualg->alg_key_len); + *algpp = p; + return 0; +} + +static int attach_encap_tmpl(struct xfrm_encap_tmpl **encapp, 
struct rtattr *u_arg) +{ + struct rtattr *rta = u_arg; + struct xfrm_encap_tmpl *p, *uencap; + + if (!rta) + return 0; + + uencap = RTA_DATA(rta); + p = kmalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return -ENOMEM; + + memcpy(p, uencap, sizeof(*p)); + *encapp = p; + return 0; +} + +static void copy_from_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p) +{ + memcpy(&x->id, &p->id, sizeof(x->id)); + memcpy(&x->sel, &p->sel, sizeof(x->sel)); + memcpy(&x->lft, &p->lft, sizeof(x->lft)); + x->props.mode = p->mode; + x->props.replay_window = p->replay_window; + x->props.reqid = p->reqid; + x->props.family = p->family; + x->props.saddr = p->saddr; + x->props.flags = p->flags; +} + +static struct xfrm_state *xfrm_state_construct(struct xfrm_usersa_info *p, + struct rtattr **xfrma, + int *errp) +{ + struct xfrm_state *x = xfrm_state_alloc(); + int err = -ENOMEM; + + if (!x) + goto error_no_put; + + copy_from_user_state(x, p); + + if ((err = attach_one_algo(&x->aalg, xfrma[XFRMA_ALG_AUTH-1]))) + goto error; + if ((err = attach_one_algo(&x->ealg, xfrma[XFRMA_ALG_CRYPT-1]))) + goto error; + if ((err = attach_one_algo(&x->calg, xfrma[XFRMA_ALG_COMP-1]))) + goto error; + if ((err = attach_encap_tmpl(&x->encap, xfrma[XFRMA_ENCAP-1]))) + goto error; + + err = -ENOENT; + x->type = xfrm_get_type(x->id.proto, x->props.family); + if (x->type == NULL) + goto error; + + err = x->type->init_state(x, NULL); + if (err) + goto error; + + x->curlft.add_time = (unsigned long) xtime.tv_sec; + x->km.state = XFRM_STATE_VALID; + x->km.seq = p->seq; + + return x; + +error: + x->km.state = XFRM_STATE_DEAD; + xfrm_state_put(x); +error_no_put: + *errp = err; + return NULL; +} + +static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma) +{ + struct xfrm_usersa_info *p = NLMSG_DATA(nlh); + struct xfrm_state *x; + int err; + + err = verify_newsa_info(p, (struct rtattr **) xfrma); + if (err) + return err; + + x = xfrm_state_construct(p, (struct rtattr **) xfrma, &err); + if 
(!x) + return err; + + if (nlh->nlmsg_type == XFRM_MSG_NEWSA) + err = xfrm_state_add(x); + else + err = xfrm_state_update(x); + + if (err < 0) { + x->km.state = XFRM_STATE_DEAD; + xfrm_state_put(x); + } + + return err; +} + +static int xfrm_del_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma) +{ + struct xfrm_state *x; + struct xfrm_usersa_id *p = NLMSG_DATA(nlh); + + x = xfrm_state_lookup(&p->daddr, p->spi, p->proto, p->family); + if (x == NULL) + return -ESRCH; + + if (xfrm_state_kern(x)) { + xfrm_state_put(x); + return -EPERM; + } + + xfrm_state_delete(x); + xfrm_state_put(x); + + return 0; +} + +static void copy_to_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p) +{ + memcpy(&p->id, &x->id, sizeof(p->id)); + memcpy(&p->sel, &x->sel, sizeof(p->sel)); + memcpy(&p->lft, &x->lft, sizeof(p->lft)); + memcpy(&p->curlft, &x->curlft, sizeof(p->curlft)); + memcpy(&p->stats, &x->stats, sizeof(p->stats)); + p->saddr = x->props.saddr; + p->mode = x->props.mode; + p->replay_window = x->props.replay_window; + p->reqid = x->props.reqid; + p->family = x->props.family; + p->flags = x->props.flags; + p->seq = x->km.seq; +} + +struct xfrm_dump_info { + struct sk_buff *in_skb; + struct sk_buff *out_skb; + u32 nlmsg_seq; + int start_idx; + int this_idx; +}; + +static int dump_one_state(struct xfrm_state *x, int count, void *ptr) +{ + struct xfrm_dump_info *sp = ptr; + struct sk_buff *in_skb = sp->in_skb; + struct sk_buff *skb = sp->out_skb; + struct xfrm_usersa_info *p; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + + if (sp->this_idx < sp->start_idx) + goto out; + + nlh = NLMSG_PUT(skb, NETLINK_CB(in_skb).pid, + sp->nlmsg_seq, + XFRM_MSG_NEWSA, sizeof(*p)); + nlh->nlmsg_flags = 0; + + p = NLMSG_DATA(nlh); + copy_to_user_state(x, p); + + if (x->aalg) + RTA_PUT(skb, XFRMA_ALG_AUTH, + sizeof(*(x->aalg))+(x->aalg->alg_key_len+7)/8, x->aalg); + if (x->ealg) + RTA_PUT(skb, XFRMA_ALG_CRYPT, + sizeof(*(x->ealg))+(x->ealg->alg_key_len+7)/8, x->ealg); + if 
(x->calg) + RTA_PUT(skb, XFRMA_ALG_COMP, sizeof(*(x->calg)), x->calg); + + if (x->encap) + RTA_PUT(skb, XFRMA_ENCAP, sizeof(*x->encap), x->encap); + + nlh->nlmsg_len = skb->tail - b; +out: + sp->this_idx++; + return 0; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int xfrm_dump_sa(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct xfrm_dump_info info; + + info.in_skb = cb->skb; + info.out_skb = skb; + info.nlmsg_seq = cb->nlh->nlmsg_seq; + info.this_idx = 0; + info.start_idx = cb->args[0]; + (void) xfrm_state_walk(IPSEC_PROTO_ANY, dump_one_state, &info); + cb->args[0] = info.this_idx; + + return skb->len; +} + +static struct sk_buff *xfrm_state_netlink(struct sk_buff *in_skb, + struct xfrm_state *x, u32 seq) +{ + struct xfrm_dump_info info; + struct sk_buff *skb; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); + if (!skb) + return ERR_PTR(-ENOMEM); + + NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid; + info.in_skb = in_skb; + info.out_skb = skb; + info.nlmsg_seq = seq; + info.this_idx = info.start_idx = 0; + + if (dump_one_state(x, 0, &info)) { + kfree_skb(skb); + return NULL; + } + + return skb; +} + +static int xfrm_get_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma) +{ + struct xfrm_usersa_id *p = NLMSG_DATA(nlh); + struct xfrm_state *x; + struct sk_buff *resp_skb; + int err; + + x = xfrm_state_lookup(&p->daddr, p->spi, p->proto, p->family); + err = -ESRCH; + if (x == NULL) + goto out_noput; + + resp_skb = xfrm_state_netlink(skb, x, nlh->nlmsg_seq); + if (IS_ERR(resp_skb)) { + err = PTR_ERR(resp_skb); + } else { + err = netlink_unicast(xfrm_nl, resp_skb, + NETLINK_CB(skb).pid, MSG_DONTWAIT); + } + xfrm_state_put(x); +out_noput: + return err; +} + +static int verify_userspi_info(struct xfrm_userspi_info *p) +{ + switch (p->info.id.proto) { + case IPPROTO_AH: + case IPPROTO_ESP: + break; + + case IPPROTO_COMP: + /* IPCOMP spi is 16-bits. 
*/ + if (p->max >= 0x10000) + return -EINVAL; + break; + + default: + return -EINVAL; + }; + + if (p->min > p->max) + return -EINVAL; + + return 0; +} + +static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma) +{ + struct xfrm_state *x; + struct xfrm_userspi_info *p; + struct sk_buff *resp_skb; + int err; + + p = NLMSG_DATA(nlh); + err = verify_userspi_info(p); + if (err) + goto out_noput; + x = xfrm_find_acq(p->info.mode, p->info.reqid, p->info.id.proto, + &p->info.id.daddr, + &p->info.saddr, 1, + p->info.family); + err = -ENOENT; + if (x == NULL) + goto out_noput; + + resp_skb = ERR_PTR(-ENOENT); + + spin_lock_bh(&x->lock); + if (x->km.state != XFRM_STATE_DEAD) { + xfrm_alloc_spi(x, htonl(p->min), htonl(p->max)); + if (x->id.spi) + resp_skb = xfrm_state_netlink(skb, x, nlh->nlmsg_seq); + } + spin_unlock_bh(&x->lock); + + if (IS_ERR(resp_skb)) { + err = PTR_ERR(resp_skb); + goto out; + } + + err = netlink_unicast(xfrm_nl, resp_skb, + NETLINK_CB(skb).pid, MSG_DONTWAIT); + +out: + xfrm_state_put(x); +out_noput: + return err; +} + +static int verify_policy_dir(__u8 dir) +{ + switch (dir) { + case XFRM_POLICY_IN: + case XFRM_POLICY_OUT: + case XFRM_POLICY_FWD: + break; + + default: + return -EINVAL; + }; + + return 0; +} + +static int verify_newpolicy_info(struct xfrm_userpolicy_info *p) +{ + switch (p->share) { + case XFRM_SHARE_ANY: + case XFRM_SHARE_SESSION: + case XFRM_SHARE_USER: + case XFRM_SHARE_UNIQUE: + break; + + default: + return -EINVAL; + }; + + switch (p->action) { + case XFRM_POLICY_ALLOW: + case XFRM_POLICY_BLOCK: + break; + + default: + return -EINVAL; + }; + + switch (p->sel.family) { + case AF_INET: + break; + + case AF_INET6: +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + break; +#else + return -EAFNOSUPPORT; +#endif + + default: + return -EINVAL; + }; + + return verify_policy_dir(p->dir); +} + +static void copy_templates(struct xfrm_policy *xp, struct xfrm_user_tmpl *ut, + int nr) +{ + int i; + + 
xp->xfrm_nr = nr; + for (i = 0; i < nr; i++, ut++) { + struct xfrm_tmpl *t = &xp->xfrm_vec[i]; + + memcpy(&t->id, &ut->id, sizeof(struct xfrm_id)); + memcpy(&t->saddr, &ut->saddr, + sizeof(xfrm_address_t)); + t->reqid = ut->reqid; + t->mode = ut->mode; + t->share = ut->share; + t->optional = ut->optional; + t->aalgos = ut->aalgos; + t->ealgos = ut->ealgos; + t->calgos = ut->calgos; + } +} + +static int copy_from_user_tmpl(struct xfrm_policy *pol, struct rtattr **xfrma) +{ + struct rtattr *rt = xfrma[XFRMA_TMPL-1]; + struct xfrm_user_tmpl *utmpl; + int nr; + + if (!rt) { + pol->xfrm_nr = 0; + } else { + nr = (rt->rta_len - sizeof(*rt)) / sizeof(*utmpl); + + if (nr > XFRM_MAX_DEPTH) + return -EINVAL; + + copy_templates(pol, RTA_DATA(rt), nr); + } + return 0; +} + +static void copy_from_user_policy(struct xfrm_policy *xp, struct xfrm_userpolicy_info *p) +{ + xp->priority = p->priority; + xp->index = p->index; + memcpy(&xp->selector, &p->sel, sizeof(xp->selector)); + memcpy(&xp->lft, &p->lft, sizeof(xp->lft)); + xp->action = p->action; + xp->flags = p->flags; + xp->family = p->sel.family; + /* XXX xp->share = p->share; */ +} + +static void copy_to_user_policy(struct xfrm_policy *xp, struct xfrm_userpolicy_info *p, int dir) +{ + memcpy(&p->sel, &xp->selector, sizeof(p->sel)); + memcpy(&p->lft, &xp->lft, sizeof(p->lft)); + memcpy(&p->curlft, &xp->curlft, sizeof(p->curlft)); + p->priority = xp->priority; + p->index = xp->index; + p->sel.family = xp->family; + p->dir = dir; + p->action = xp->action; + p->flags = xp->flags; + p->share = XFRM_SHARE_ANY; /* XXX xp->share */ +} + +static struct xfrm_policy *xfrm_policy_construct(struct xfrm_userpolicy_info *p, struct rtattr **xfrma, int *errp) +{ + struct xfrm_policy *xp = xfrm_policy_alloc(GFP_KERNEL); + int err; + + if (!xp) { + *errp = -ENOMEM; + return NULL; + } + + copy_from_user_policy(xp, p); + err = copy_from_user_tmpl(xp, xfrma); + if (err) { + *errp = err; + kfree(xp); + xp = NULL; + } + + return xp; +} + +static int 
xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma) +{ + struct xfrm_userpolicy_info *p = NLMSG_DATA(nlh); + struct xfrm_policy *xp; + int err; + int excl; + + err = verify_newpolicy_info(p); + if (err) + return err; + + xp = xfrm_policy_construct(p, (struct rtattr **) xfrma, &err); + if (!xp) + return err; + + excl = nlh->nlmsg_type == XFRM_MSG_NEWPOLICY; + err = xfrm_policy_insert(p->dir, xp, excl); + if (err) { + kfree(xp); + return err; + } + + xfrm_pol_put(xp); + + return 0; +} + +static int copy_to_user_tmpl(struct xfrm_policy *xp, struct sk_buff *skb) +{ + struct xfrm_user_tmpl vec[XFRM_MAX_DEPTH]; + int i; + + if (xp->xfrm_nr == 0) + return 0; + + for (i = 0; i < xp->xfrm_nr; i++) { + struct xfrm_user_tmpl *up = &vec[i]; + struct xfrm_tmpl *kp = &xp->xfrm_vec[i]; + + memcpy(&up->id, &kp->id, sizeof(up->id)); + up->family = xp->family; + memcpy(&up->saddr, &kp->saddr, sizeof(up->saddr)); + up->reqid = kp->reqid; + up->mode = kp->mode; + up->share = kp->share; + up->optional = kp->optional; + up->aalgos = kp->aalgos; + up->ealgos = kp->ealgos; + up->calgos = kp->calgos; + } + RTA_PUT(skb, XFRMA_TMPL, + (sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr), + vec); + + return 0; + +rtattr_failure: + return -1; +} + +static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr) +{ + struct xfrm_dump_info *sp = ptr; + struct xfrm_userpolicy_info *p; + struct sk_buff *in_skb = sp->in_skb; + struct sk_buff *skb = sp->out_skb; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + + if (sp->this_idx < sp->start_idx) + goto out; + + nlh = NLMSG_PUT(skb, NETLINK_CB(in_skb).pid, + sp->nlmsg_seq, + XFRM_MSG_NEWPOLICY, sizeof(*p)); + p = NLMSG_DATA(nlh); + nlh->nlmsg_flags = 0; + + copy_to_user_policy(xp, p, dir); + if (copy_to_user_tmpl(xp, skb) < 0) + goto nlmsg_failure; + + nlh->nlmsg_len = skb->tail - b; +out: + sp->this_idx++; + return 0; + +nlmsg_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int 
xfrm_dump_policy(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct xfrm_dump_info info; + + info.in_skb = cb->skb; + info.out_skb = skb; + info.nlmsg_seq = cb->nlh->nlmsg_seq; + info.this_idx = 0; + info.start_idx = cb->args[0]; + (void) xfrm_policy_walk(dump_one_policy, &info); + cb->args[0] = info.this_idx; + + return skb->len; +} + +static struct sk_buff *xfrm_policy_netlink(struct sk_buff *in_skb, + struct xfrm_policy *xp, + int dir, u32 seq) +{ + struct xfrm_dump_info info; + struct sk_buff *skb; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return ERR_PTR(-ENOMEM); + + NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid; + info.in_skb = in_skb; + info.out_skb = skb; + info.nlmsg_seq = seq; + info.this_idx = info.start_idx = 0; + + if (dump_one_policy(xp, dir, 0, &info) < 0) { + kfree_skb(skb); + return NULL; + } + + return skb; +} + +static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma) +{ + struct xfrm_policy *xp; + struct xfrm_userpolicy_id *p; + int err; + int delete; + + p = NLMSG_DATA(nlh); + delete = nlh->nlmsg_type == XFRM_MSG_DELPOLICY; + + err = verify_policy_dir(p->dir); + if (err) + return err; + + if (p->index) + xp = xfrm_policy_byid(p->dir, p->index, delete); + else + xp = xfrm_policy_bysel(p->dir, &p->sel, delete); + if (xp == NULL) + return -ENOENT; + + if (!delete) { + struct sk_buff *resp_skb; + + resp_skb = xfrm_policy_netlink(skb, xp, p->dir, nlh->nlmsg_seq); + if (IS_ERR(resp_skb)) { + err = PTR_ERR(resp_skb); + } else { + err = netlink_unicast(xfrm_nl, resp_skb, + NETLINK_CB(skb).pid, + MSG_DONTWAIT); + } + } + + xfrm_pol_put(xp); + + return err; +} + +static const int xfrm_msg_min[(XFRM_MSG_MAX + 1 - XFRM_MSG_BASE)] = { + NLMSG_LENGTH(sizeof(struct xfrm_usersa_info)), /* NEW SA */ + NLMSG_LENGTH(sizeof(struct xfrm_usersa_id)), /* DEL SA */ + NLMSG_LENGTH(sizeof(struct xfrm_usersa_id)), /* GET SA */ + NLMSG_LENGTH(sizeof(struct xfrm_userpolicy_info)),/* NEW POLICY */ + 
NLMSG_LENGTH(sizeof(struct xfrm_userpolicy_id)), /* DEL POLICY */ + NLMSG_LENGTH(sizeof(struct xfrm_userpolicy_id)), /* GET POLICY */ + NLMSG_LENGTH(sizeof(struct xfrm_userspi_info)), /* ALLOC SPI */ + NLMSG_LENGTH(sizeof(struct xfrm_user_acquire)), /* ACQUIRE */ + NLMSG_LENGTH(sizeof(struct xfrm_user_expire)), /* EXPIRE */ + NLMSG_LENGTH(sizeof(struct xfrm_userpolicy_info)),/* UPD POLICY */ + NLMSG_LENGTH(sizeof(struct xfrm_usersa_info)), /* UPD SA */ +}; + +static struct xfrm_link { + int (*doit)(struct sk_buff *, struct nlmsghdr *, void **); + int (*dump)(struct sk_buff *, struct netlink_callback *); +} xfrm_dispatch[] = { + { .doit = xfrm_add_sa, }, + { .doit = xfrm_del_sa, }, + { + .doit = xfrm_get_sa, + .dump = xfrm_dump_sa, + }, + { .doit = xfrm_add_policy }, + { .doit = xfrm_get_policy }, + { + .doit = xfrm_get_policy, + .dump = xfrm_dump_policy, + }, + { .doit = xfrm_alloc_userspi }, + {}, + {}, + { .doit = xfrm_add_policy }, + { .doit = xfrm_add_sa, }, +}; + +static int xfrm_done(struct netlink_callback *cb) +{ + return 0; +} + +static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp) +{ + struct rtattr *xfrma[XFRMA_MAX]; + struct xfrm_link *link; + int type, min_len; + + if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) + return 0; + + type = nlh->nlmsg_type; + + /* A control message: ignore them */ + if (type < XFRM_MSG_BASE) + return 0; + + /* Unknown message: reply with EINVAL */ + if (type > XFRM_MSG_MAX) + goto err_einval; + + type -= XFRM_MSG_BASE; + link = &xfrm_dispatch[type]; + + /* All operations require privileges, even GET */ + if (!cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN)) { + *errp = -EPERM; + return -1; + } + + if ((type == 2 || type == 5) && (nlh->nlmsg_flags & NLM_F_DUMP)) { + u32 rlen; + + if (link->dump == NULL) + goto err_einval; + + if ((*errp = netlink_dump_start(xfrm_nl, skb, nlh, + link->dump, + xfrm_done)) != 0) { + return -1; + } + rlen = NLMSG_ALIGN(nlh->nlmsg_len); + if (rlen > skb->len) + rlen = 
skb->len; + skb_pull(skb, rlen); + return -1; + } + + memset(xfrma, 0, sizeof(xfrma)); + + if (nlh->nlmsg_len < (min_len = xfrm_msg_min[type])) + goto err_einval; + + if (nlh->nlmsg_len > min_len) { + int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len); + struct rtattr *attr = (void *) nlh + NLMSG_ALIGN(min_len); + + while (RTA_OK(attr, attrlen)) { + unsigned short flavor = attr->rta_type; + if (flavor) { + if (flavor > XFRMA_MAX) + goto err_einval; + xfrma[flavor - 1] = attr; + } + attr = RTA_NEXT(attr, attrlen); + } + } + + if (link->doit == NULL) + goto err_einval; + *errp = link->doit(skb, nlh, (void **) &xfrma); + + return *errp; + +err_einval: + *errp = -EINVAL; + return -1; +} + +static int xfrm_user_rcv_skb(struct sk_buff *skb) +{ + int err; + struct nlmsghdr *nlh; + + while (skb->len >= NLMSG_SPACE(0)) { + u32 rlen; + + nlh = (struct nlmsghdr *) skb->data; + if (nlh->nlmsg_len < sizeof(*nlh) || + skb->len < nlh->nlmsg_len) + return 0; + rlen = NLMSG_ALIGN(nlh->nlmsg_len); + if (rlen > skb->len) + rlen = skb->len; + if (xfrm_user_rcv_msg(skb, nlh, &err) < 0) { + if (err == 0) + return -1; + netlink_ack(skb, nlh, err); + } else if (nlh->nlmsg_flags & NLM_F_ACK) + netlink_ack(skb, nlh, 0); + skb_pull(skb, rlen); + } + + return 0; +} + +static void xfrm_netlink_rcv(struct sock *sk, int len) +{ + do { + struct sk_buff *skb; + + down(&xfrm_cfg_sem); + + while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) { + if (xfrm_user_rcv_skb(skb)) { + if (skb->len) + skb_queue_head(&sk->receive_queue, skb); + else + kfree_skb(skb); + break; + } + kfree_skb(skb); + } + + up(&xfrm_cfg_sem); + + } while (xfrm_nl && xfrm_nl->receive_queue.qlen); +} + +static int build_expire(struct sk_buff *skb, struct xfrm_state *x, int hard) +{ + struct xfrm_user_expire *ue; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + + nlh = NLMSG_PUT(skb, 0, 0, XFRM_MSG_EXPIRE, + sizeof(*ue)); + ue = NLMSG_DATA(nlh); + nlh->nlmsg_flags = 0; + + copy_to_user_state(x, &ue->state); + ue->hard 
= (hard != 0) ? 1 : 0; + + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int xfrm_send_state_notify(struct xfrm_state *x, int hard) +{ + struct sk_buff *skb; + + skb = alloc_skb(sizeof(struct xfrm_user_expire) + 16, GFP_ATOMIC); + if (skb == NULL) + return -ENOMEM; + + if (build_expire(skb, x, hard) < 0) + BUG(); + + NETLINK_CB(skb).dst_groups = XFRMGRP_EXPIRE; + + return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_EXPIRE, GFP_ATOMIC); +} + +static int build_acquire(struct sk_buff *skb, struct xfrm_state *x, + struct xfrm_tmpl *xt, struct xfrm_policy *xp, + int dir) +{ + struct xfrm_user_acquire *ua; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + __u32 seq = xfrm_get_acqseq(); + + nlh = NLMSG_PUT(skb, 0, 0, XFRM_MSG_ACQUIRE, + sizeof(*ua)); + ua = NLMSG_DATA(nlh); + nlh->nlmsg_flags = 0; + + memcpy(&ua->id, &x->id, sizeof(ua->id)); + memcpy(&ua->saddr, &x->props.saddr, sizeof(ua->saddr)); + memcpy(&ua->sel, &x->sel, sizeof(ua->sel)); + copy_to_user_policy(xp, &ua->policy, dir); + ua->aalgos = xt->aalgos; + ua->ealgos = xt->ealgos; + ua->calgos = xt->calgos; + ua->seq = x->km.seq = seq; + + if (copy_to_user_tmpl(xp, skb) < 0) + goto nlmsg_failure; + + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int xfrm_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *xt, + struct xfrm_policy *xp, int dir) +{ + struct sk_buff *skb; + size_t len; + + len = RTA_SPACE(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr); + len += NLMSG_SPACE(sizeof(struct xfrm_user_acquire)); + skb = alloc_skb(len, GFP_ATOMIC); + if (skb == NULL) + return -ENOMEM; + + if (build_acquire(skb, x, xt, xp, dir) < 0) + BUG(); + + NETLINK_CB(skb).dst_groups = XFRMGRP_ACQUIRE; + + return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_ACQUIRE, GFP_ATOMIC); +} + +/* User gives us xfrm_user_policy_info followed by an array of 0 + * or more templates. 
+ */ +struct xfrm_policy *xfrm_compile_policy(u16 family, int opt, + u8 *data, int len, int *dir) +{ + struct xfrm_userpolicy_info *p = (struct xfrm_userpolicy_info *)data; + struct xfrm_user_tmpl *ut = (struct xfrm_user_tmpl *) (p + 1); + struct xfrm_policy *xp; + int nr; + + switch (family) { + case AF_INET: + if (opt != IP_XFRM_POLICY) { + *dir = -EOPNOTSUPP; + return NULL; + } + break; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case AF_INET6: + if (opt != IPV6_XFRM_POLICY) { + *dir = -EOPNOTSUPP; + return NULL; + } + break; +#endif + default: + *dir = -EINVAL; + return NULL; + } + + *dir = -EINVAL; + + if (len < sizeof(*p) || + verify_newpolicy_info(p)) + return NULL; + + nr = ((len - sizeof(*p)) / sizeof(*ut)); + if (nr > XFRM_MAX_DEPTH) + return NULL; + + xp = xfrm_policy_alloc(GFP_KERNEL); + if (xp == NULL) { + *dir = -ENOBUFS; + return NULL; + } + + copy_from_user_policy(xp, p); + copy_templates(xp, ut, nr); + + *dir = p->dir; + + return xp; +} + +static int build_polexpire(struct sk_buff *skb, struct xfrm_policy *xp, + int dir, int hard) +{ + struct xfrm_user_polexpire *upe; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + + nlh = NLMSG_PUT(skb, 0, 0, XFRM_MSG_POLEXPIRE, sizeof(*upe)); + upe = NLMSG_DATA(nlh); + nlh->nlmsg_flags = 0; + + copy_to_user_policy(xp, &upe->pol, dir); + if (copy_to_user_tmpl(xp, skb) < 0) + goto nlmsg_failure; + upe->hard = !!hard; + + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int xfrm_send_policy_notify(struct xfrm_policy *xp, int dir, int hard) +{ + struct sk_buff *skb; + size_t len; + + len = RTA_SPACE(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr); + len += NLMSG_SPACE(sizeof(struct xfrm_user_polexpire)); + skb = alloc_skb(len, GFP_ATOMIC); + if (skb == NULL) + return -ENOMEM; + + if (build_polexpire(skb, xp, dir, hard) < 0) + BUG(); + + NETLINK_CB(skb).dst_groups = XFRMGRP_EXPIRE; + + return netlink_broadcast(xfrm_nl, 
skb, 0, XFRMGRP_EXPIRE, GFP_ATOMIC); +} + +static struct xfrm_mgr netlink_mgr = { + .id = "netlink", + .notify = xfrm_send_state_notify, + .acquire = xfrm_send_acquire, + .compile_policy = xfrm_compile_policy, + .notify_policy = xfrm_send_policy_notify, +}; + +static int __init xfrm_user_init(void) +{ + printk(KERN_INFO "Initializing IPsec netlink socket\n"); + + xfrm_nl = netlink_kernel_create(NETLINK_XFRM, xfrm_netlink_rcv); + if (xfrm_nl == NULL) + panic("xfrm_user_init: cannot initialize xfrm_nl\n"); + + + xfrm_register_km(&netlink_mgr); + + return 0; +} + +static void __exit xfrm_user_exit(void) +{ + xfrm_unregister_km(&netlink_mgr); + sock_release(xfrm_nl->socket); +} + +module_init(xfrm_user_init); +module_exit(xfrm_user_exit); +MODULE_LICENSE("GPL"); Index: scripts/tkgen.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/scripts/tkgen.c,v retrieving revision 1.1.1.15 retrieving revision 1.1.1.15.2.1 diff -u -r1.1.1.15 -r1.1.1.15.2.1 --- a/scripts/tkgen.c 3 Aug 2002 00:39:46 -0000 1.1.1.15 +++ b/scripts/tkgen.c 16 Apr 2004 13:16:27 -0000 1.1.1.15.2.1 @@ -546,7 +546,7 @@ printf( "set %s [expr $%s&15]", vartable[cfg->nameindex].name, vartable[cfg->nameindex].name ); printf( "} else {"); - printf( "set %s [expr $%s|16]}\n", + printf( "set %s [expr $%s]}\n", vartable[cfg->nameindex].name, vartable[cfg->nameindex].name ); break; @@ -612,7 +612,7 @@ /* * Clear the disable bit to enable the correct radiobutton. */ - printf( "set %s [expr $%s|16]}\n", + printf( "set %s [expr $%s]}\n", vartable[cfg->nameindex].name, vartable[cfg->nameindex].name ); break;