diff options
author | Erik Dubbelboer <erik@dubbelboer.com> | 2021-10-01 13:33:42 +0200 |
---|---|---|
committer | Erik Dubbelboer <erik@dubbelboer.com> | 2021-10-01 13:38:31 +0200 |
commit | 542a203b42a3cb722244e1ef74fda0804a926383 (patch) | |
tree | 5690b21323c3ae6a779603e4ac3a21c1bada9a85 /uri.go | |
parent | feat: improve TCPDialer by `sync.map` instead of `map+mutex` (#1106) (diff) | |
download | fasthttp-542a203b42a3cb722244e1ef74fda0804a926383.tar.gz fasthttp-542a203b42a3cb722244e1ef74fda0804a926383.tar.bz2 fasthttp-542a203b42a3cb722244e1ef74fda0804a926383.zip |
Properly parse URI
Use URI parse code based on net/url to validate hostnames.
Diffstat (limited to 'uri.go')
-rw-r--r-- | uri.go | 221 |
1 files changed, 221 insertions, 0 deletions
// Code added to uri.go by this commit, reconstructed from the diff hunks.
//
// Besides the definitions below, the commit makes two small edits:
//   - the import block gains "strconv";
//   - (*URI).parse now calls parseHost(host) and returns its error, if any,
//     before doing u.host = append(u.host, host...) and lowercaseBytes(u.host).
//
// The definitions are inserted between (*URI).parse and normalizePath.

// parseHost validates host as an authority without user information, that
// is as host[:port], and returns it with its %-escapes decoded.
//
// The input slice is never modified. When nothing has to be decoded the
// returned slice is host itself; otherwise it is freshly allocated.
//
// Errors: a plain error for a missing ']' or a malformed port, EscapeError
// for a malformed or disallowed %-escape, and InvalidHostError for a byte
// that may not appear in a host.
//
// Based on https://github.com/golang/go/blob/8ac5cbe05d61df0a7a7c9a38ff33305d4dcfea32/src/net/url/url.go#L619
func parseHost(host []byte) ([]byte, error) {
	if len(host) > 0 && host[0] == '[' {
		// Parse an IP-Literal in RFC 3986 and RFC 6874.
		// E.g., "[fe80::1]", "[fe80::1%25en0]", "[fe80::1]:80".
		i := bytes.LastIndexByte(host, ']')
		if i < 0 {
			return nil, errors.New("missing ']' in host")
		}
		colonPort := host[i+1:]
		if !validOptionalPort(colonPort) {
			return nil, fmt.Errorf("invalid port %q after host", colonPort)
		}

		// RFC 6874 defines that %25 (%-encoded percent) introduces
		// the zone identifier, and the zone identifier can use basically
		// any %-encoding it likes. That's different from the host, which
		// can only %-encode non-ASCII bytes.
		// We do impose some restrictions on the zone, to avoid stupidity
		// like newlines.
		if zone := bytes.Index(host[:i], []byte("%25")); zone >= 0 {
			host1, err := unescape(host[:zone], encodeHost)
			if err != nil {
				return nil, err
			}
			host2, err := unescape(host[zone:i], encodeZone)
			if err != nil {
				return nil, err
			}
			host3, err := unescape(host[i:], encodeHost)
			if err != nil {
				return nil, err
			}
			// host1 and host3 may still alias the caller's slice, so the
			// pieces are joined in a new buffer instead of being appended
			// onto one another (which would overwrite the input).
			out := make([]byte, 0, len(host1)+len(host2)+len(host3))
			out = append(out, host1...)
			out = append(out, host2...)
			out = append(out, host3...)
			return out, nil
		}
	} else if i := bytes.LastIndexByte(host, ':'); i != -1 {
		colonPort := host[i:]
		if !validOptionalPort(colonPort) {
			return nil, fmt.Errorf("invalid port %q after host", colonPort)
		}
	}

	return unescape(host, encodeHost)
}

// encoding selects which part of the authority is being validated and
// unescaped.
type encoding int

const (
	// encodeHost is the host proper (reg-name or IP literal, plus port).
	encodeHost encoding = 1 + iota
	// encodeZone is the RFC 6874 zone identifier of an IPv6 literal.
	encodeZone
)

// EscapeError is returned for a %-escape that is malformed or not permitted
// at its position. Its value is the offending escape text.
type EscapeError string

// Error implements the error interface.
func (e EscapeError) Error() string {
	return "invalid URL escape " + strconv.Quote(string(e))
}

// InvalidHostError is returned for a byte that may not appear unescaped in
// a host. Its value is the offending byte.
type InvalidHostError string

// Error implements the error interface.
func (e InvalidHostError) Error() string {
	return "invalid character " + strconv.Quote(string(e)) + " in host name"
}

// unescape validates s for the given mode and returns it with every %XX
// sequence replaced by the byte it encodes.
//
// s is never modified: if it contains no escapes it is returned as is,
// otherwise the decoded bytes go into a newly allocated slice.
//
// Based on https://github.com/golang/go/blob/8ac5cbe05d61df0a7a7c9a38ff33305d4dcfea32/src/net/url/url.go#L199
func unescape(s []byte, mode encoding) ([]byte, error) {
	// Count %, check that they're well-formed.
	n := 0
	for i := 0; i < len(s); {
		switch s[i] {
		case '%':
			n++
			if i+2 >= len(s) || !ishex(s[i+1]) || !ishex(s[i+2]) {
				// Report at most the three bytes of the broken escape.
				bad := s[i:]
				if len(bad) > 3 {
					bad = bad[:3]
				}
				return nil, EscapeError(bad)
			}
			isPercent := string(s[i:i+3]) == "%25"
			// Per https://tools.ietf.org/html/rfc3986#page-21
			// in the host component %-encoding can only be used
			// for non-ASCII bytes.
			// But https://tools.ietf.org/html/rfc6874#section-2
			// introduces %25 being allowed to escape a percent sign
			// in IPv6 scoped-address literals.
			if mode == encodeHost && unhex(s[i+1]) < 8 && !isPercent {
				return nil, EscapeError(s[i : i+3])
			}
			if mode == encodeZone {
				// RFC 6874 says basically "anything goes" for zone identifiers
				// and that even non-ASCII can be redundantly escaped,
				// but it seems prudent to restrict %-escaped bytes here to those
				// that are valid host name bytes in their unescaped form.
				// That is, you can use escaping in the zone identifier but not
				// to introduce bytes you couldn't just write directly.
				// Spaces are let through because Windows puts them here.
				v := unhex(s[i+1])<<4 | unhex(s[i+2])
				if !isPercent && v != ' ' && shouldEscape(v, encodeHost) {
					return nil, EscapeError(s[i : i+3])
				}
			}
			i += 3
		default:
			if (mode == encodeHost || mode == encodeZone) && s[i] < 0x80 && shouldEscape(s[i], mode) {
				return nil, InvalidHostError(s[i : i+1])
			}
			i++
		}
	}

	if n == 0 {
		return s, nil
	}

	// Every escape shrinks from three bytes to one.
	t := make([]byte, 0, len(s)-2*n)
	for i := 0; i < len(s); i++ {
		if s[i] == '%' {
			t = append(t, unhex(s[i+1])<<4|unhex(s[i+2]))
			i += 2
			continue
		}
		t = append(t, s[i])
	}
	return t, nil
}

// shouldEscape reports whether c would have to be escaped when appearing
// in a URL host, according to RFC 3986.
//
// Please be informed that for now shouldEscape does not check all
// reserved characters correctly. See golang.org/issue/5684.
//
// Based on https://github.com/golang/go/blob/8ac5cbe05d61df0a7a7c9a38ff33305d4dcfea32/src/net/url/url.go#L100
func shouldEscape(c byte, mode encoding) bool {
	// §2.3 Unreserved characters (alphanum)
	if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
		return false
	}

	if mode == encodeHost || mode == encodeZone {
		// §3.2.2 Host allows
		//	sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
		// as part of reg-name.
		// We add : because we include :port as part of host.
		// We add [ ] because we include [ipv6]:port as part of host.
		// We add < > because they're the only characters left that
		// we could possibly allow, and Parse will reject them if we
		// escape them (because hosts can't use %-encoding for
		// ASCII bytes).
		// The double quote is accepted here as well.
		switch c {
		case '!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '=', ':', '[', ']', '<', '>', '"':
			return false
		}
	}

	if c == '-' || c == '_' || c == '.' || c == '~' { // §2.3 Unreserved characters (mark)
		return false
	}

	// Everything else must be escaped.
	return true
}

// ishex reports whether c is an ASCII hexadecimal digit.
func ishex(c byte) bool {
	switch {
	case '0' <= c && c <= '9':
		return true
	case 'a' <= c && c <= 'f':
		return true
	case 'A' <= c && c <= 'F':
		return true
	}
	return false
}

// unhex returns the numeric value of the hexadecimal digit c, or 0 when c
// is not a hexadecimal digit.
func unhex(c byte) byte {
	switch {
	case '0' <= c && c <= '9':
		return c - '0'
	case 'a' <= c && c <= 'f':
		return c - 'a' + 10
	case 'A' <= c && c <= 'F':
		return c - 'A' + 10
	}
	return 0
}

// validOptionalPort reports whether port is either an empty string
// or matches /^:\d*$/
func validOptionalPort(port []byte) bool {
	if len(port) == 0 {
		return true
	}
	if port[0] != ':' {
		return false
	}
	for _, b := range port[1:] {
		if b < '0' || b > '9' {
			return false
		}
	}
	return true
}