Properly parse URI

Use URI parse code based on net/uri to validate hostnames.
author: Erik Dubbelboer <erik@dubbelboer.com> 2021-10-01 13:33:42 +0200
committer: Erik Dubbelboer <erik@dubbelboer.com> 2021-10-01 13:38:31 +0200
commit: 542a203b42a3cb722244e1ef74fda0804a926383 (patch)
tree: 5690b21323c3ae6a779603e4ac3a21c1bada9a85 /uri.go
parent: feat: improve TCPDialer by `sync.map` instead of `map+mutex` (#1106) (diff)
download: fasthttp-542a203b42a3cb722244e1ef74fda0804a926383.tar.gz
fasthttp-542a203b42a3cb722244e1ef74fda0804a926383.tar.bz2
fasthttp-542a203b42a3cb722244e1ef74fda0804a926383.zip
1 files changed, 221 insertions, 0 deletions
diff --git a/uri.go b/uri.go
index b57b1f7..a2ca6a5 100644
--- a/uri.go
+++ b/uri.go
@@ -5,6 +5,7 @@ import (
 	"errors"
 	"fmt"
 	"io"
+	"strconv"
 	"sync"
 )
 
@@ -298,6 +299,10 @@ func (u *URI) parse(host, uri []byte, isTLS bool) error {
 		}
 	}
 
+	host, err := parseHost(host)
+	if err != nil {
+		return err
+	}
 	u.host = append(u.host, host...)
 	lowercaseBytes(u.host)
 
@@ -338,6 +343,222 @@ func (u *URI) parse(host, uri []byte, isTLS bool) error {
 	return nil
 }
 
+// parseHost parses host as an authority without user
+// information. That is, as host[:port].
+//
+// Based on https://github.com/golang/go/blob/8ac5cbe05d61df0a7a7c9a38ff33305d4dcfea32/src/net/url/url.go#L619
+func parseHost(host []byte) ([]byte, error) {
+	if len(host) > 0 && host[0] == '[' {
+		// Parse an IP-Literal in RFC 3986 and RFC 6874.
+		// E.g., "[fe80::1]", "[fe80::1%25en0]", "[fe80::1]:80".
+		i := bytes.LastIndexByte(host, ']')
+		if i < 0 {
+			return nil, errors.New("missing ']' in host")
+		}
+		colonPort := host[i+1:]
+		if !validOptionalPort(colonPort) {
+			return nil, fmt.Errorf("invalid port %q after host", colonPort)
+		}
+
+		// RFC 6874 defines that %25 (%-encoded percent) introduces
+		// the zone identifier, and the zone identifier can use basically
+		// any %-encoding it likes. That's different from the host, which
+		// can only %-encode non-ASCII bytes.
+		// We do impose some restrictions on the zone, to avoid stupidity
+		// like newlines.
+		zone := bytes.Index(host[:i], []byte("%25"))
+		if zone >= 0 {
+			host1, err := unescape(host[:zone], encodeHost)
+			if err != nil {
+				return nil, err
+			}
+			host2, err := unescape(host[zone:i], encodeZone)
+			if err != nil {
+				return nil, err
+			}
+			host3, err := unescape(host[i:], encodeHost)
+			if err != nil {
+				return nil, err
+			}
+			return append(host1, append(host2, host3...)...), nil
+		}
+	} else if i := bytes.LastIndexByte(host, ':'); i != -1 {
+		colonPort := host[i:]
+		if !validOptionalPort(colonPort) {
+			return nil, fmt.Errorf("invalid port %q after host", colonPort)
+		}
+	}
+
+	var err error
+	if host, err = unescape(host, encodeHost); err != nil {
+		return nil, err
+	}
+	return host, nil
+}
+
+type encoding int
+
+const (
+	encodeHost encoding = 1 + iota
+	encodeZone
+)
+
+type EscapeError string
+
+func (e EscapeError) Error() string {
+	return "invalid URL escape " + strconv.Quote(string(e))
+}
+
+type InvalidHostError string
+
+func (e InvalidHostError) Error() string {
+	return "invalid character " + strconv.Quote(string(e)) + " in host name"
+}
+
+// unescape unescapes a string; the mode specifies
+// which section of the URL string is being unescaped.
+//
+// Based on https://github.com/golang/go/blob/8ac5cbe05d61df0a7a7c9a38ff33305d4dcfea32/src/net/url/url.go#L199
+func unescape(s []byte, mode encoding) ([]byte, error) {
+	// Count %, check that they're well-formed.
+	n := 0
+	for i := 0; i < len(s); {
+		switch s[i] {
+		case '%':
+			n++
+			if i+2 >= len(s) || !ishex(s[i+1]) || !ishex(s[i+2]) {
+				s = s[i:]
+				if len(s) > 3 {
+					s = s[:3]
+				}
+				return nil, EscapeError(s)
+			}
+			// Per https://tools.ietf.org/html/rfc3986#page-21
+			// in the host component %-encoding can only be used
+			// for non-ASCII bytes.
+			// But https://tools.ietf.org/html/rfc6874#section-2
+			// introduces %25 being allowed to escape a percent sign
+			// in IPv6 scoped-address literals. Yay.
+			if mode == encodeHost && unhex(s[i+1]) < 8 && !bytes.Equal(s[i:i+3], []byte("%25")) {
+				return nil, EscapeError(s[i : i+3])
+			}
+			if mode == encodeZone {
+				// RFC 6874 says basically "anything goes" for zone identifiers
+				// and that even non-ASCII can be redundantly escaped,
+				// but it seems prudent to restrict %-escaped bytes here to those
+				// that are valid host name bytes in their unescaped form.
+				// That is, you can use escaping in the zone identifier but not
+				// to introduce bytes you couldn't just write directly.
+				// But Windows puts spaces here! Yay.
+				v := unhex(s[i+1])<<4 | unhex(s[i+2])
+				if !bytes.Equal(s[i:i+3], []byte("%25")) && v != ' ' && shouldEscape(v, encodeHost) {
+					return nil, EscapeError(s[i : i+3])
+				}
+			}
+			i += 3
+		default:
+			if (mode == encodeHost || mode == encodeZone) && s[i] < 0x80 && shouldEscape(s[i], mode) {
+				return nil, InvalidHostError(s[i : i+1])
+			}
+			i++
+		}
+	}
+
+	if n == 0 {
+		return s, nil
+	}
+
+	t := s[:0]
+	for i := 0; i < len(s); i++ {
+		switch s[i] {
+		case '%':
+			t = append(t, unhex(s[i+1])<<4|unhex(s[i+2]))
+			i += 2
+		default:
+			t = append(t, s[i])
+		}
+	}
+	return t, nil
+}
+
+// Return true if the specified character should be escaped when
+// appearing in a URL string, according to RFC 3986.
+//
+// Please be informed that for now shouldEscape does not check all
+// reserved characters correctly. See golang.org/issue/5684.
+//
+// Based on https://github.com/golang/go/blob/8ac5cbe05d61df0a7a7c9a38ff33305d4dcfea32/src/net/url/url.go#L100
+func shouldEscape(c byte, mode encoding) bool {
+	// §2.3 Unreserved characters (alphanum)
+	if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
+		return false
+	}
+
+	if mode == encodeHost || mode == encodeZone {
+		// §3.2.2 Host allows
+		//	sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
+		// as part of reg-name.
+		// We add : because we include :port as part of host.
+		// We add [ ] because we include [ipv6]:port as part of host.
+		// We add < > because they're the only characters left that
+		// we could possibly allow, and Parse will reject them if we
+		// escape them (because hosts can't use %-encoding for
+		// ASCII bytes).
+		switch c {
+		case '!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '=', ':', '[', ']', '<', '>', '"':
+			return false
+		}
+	}
+
+	if c == '-' || c == '_' || c == '.' || c == '~' { // §2.3 Unreserved characters (mark)
+		return false
+	}
+
+	// Everything else must be escaped.
+	return true
+}
+
+func ishex(c byte) bool {
+	switch {
+	case '0' <= c && c <= '9':
+		return true
+	case 'a' <= c && c <= 'f':
+		return true
+	case 'A' <= c && c <= 'F':
+		return true
+	}
+	return false
+}
+
+func unhex(c byte) byte {
+	switch {
+	case '0' <= c && c <= '9':
+		return c - '0'
+	case 'a' <= c && c <= 'f':
+		return c - 'a' + 10
+	case 'A' <= c && c <= 'F':
+		return c - 'A' + 10
+	}
+	return 0
+}
+
+// validOptionalPort reports whether port is either an empty string
+// or matches /^:\d*$/
+func validOptionalPort(port []byte) bool {
+	if len(port) == 0 {
+		return true
+	}
+	if port[0] != ':' {
+		return false
+	}
+	for _, b := range port[1:] {
+		if b < '0' || b > '9' {
+			return false
+		}
+	}
+	return true
+}
+
 func normalizePath(dst, src []byte) []byte {
 	dst = dst[:0]
 	dst = addLeadingSlash(dst, src)
author	Erik Dubbelboer <erik@dubbelboer.com>	2021-10-01 13:33:42 +0200
committer	Erik Dubbelboer <erik@dubbelboer.com>	2021-10-01 13:38:31 +0200
commit	542a203b42a3cb722244e1ef74fda0804a926383 (patch)
tree	5690b21323c3ae6a779603e4ac3a21c1bada9a85 /uri.go
parent	feat: improve TCPDialer by `sync.map` instead of `map+mutex` (#1106) (diff)
download	fasthttp-542a203b42a3cb722244e1ef74fda0804a926383.tar.gz fasthttp-542a203b42a3cb722244e1ef74fda0804a926383.tar.bz2 fasthttp-542a203b42a3cb722244e1ef74fda0804a926383.zip