iri/iri.go at main · contomap/iri · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
package iri

import (
	"fmt"
	"strings"
	"unicode/utf8"
)

// An IRI (Internationalized Resource Identifier) is a Unicode string [UNICODE]
// that conforms to the syntax defined in RFC 3987.
//
// The value holds the components of an IRI reference separately, without
// their delimiters. String reassembles them in the order scheme, authority,
// path, query, fragment, and inserts the ':' , "//", '?' and '#' delimiters
// itself.
//
// An empty Authority, Query or Fragment is normally omitted from the output
// together with its delimiter. The corresponding Force* flag keeps the
// delimiter in the output even though the component is empty; Parse sets a
// flag when the input contained the delimiter followed by an empty component.
//
// This type is not a "drop-in" replacement for "net/url.URL". See package
// comments for details.
//
// See https://www.ietf.org/rfc/rfc3987.html
type IRI struct {
	Scheme         string
	ForceAuthority bool // append a double-slash ('//') even if Authority is empty
	Authority      string
	Path           string
	ForceQuery     bool // append a query ('?') even if Query is empty
	Query          string
	ForceFragment  bool // append a fragment ('#') even if Fragment field is empty
	Fragment       string
}

// Parse parses a string into an IRI and checks that it conforms to RFC 3987.
//
// It performs a coarse segmentation based on a regular expression to separate the components,
// and then verifies with detailed regular expressions whether the non-empty components are correct.
// Finally, any percent-encoding is verified - yet the returned IRI will have the original percent encoding
// maintained.
//
// A delimiter that is present in s but followed by an empty component ("//", "?" or "#")
// is remembered in the matching Force* field, so that String reproduces it.
//
// If any of these steps produce an error, this function returns an error and an empty IRI.
func Parse(s string) (IRI, error) {
	match := uriRE.FindStringSubmatch(s)
	if match == nil {
		// The segmentation expression is expected to match any input. Should that
		// ever not hold, report an error instead of panicking on the indexing below.
		return IRI{}, fmt.Errorf("%q is not a valid IRI: unable to separate its components", s)
	}
	scheme := match[uriRESchemeGroup]
	authority := match[uriREAuthorityGroup]
	path := match[uriREPathGroup]
	query := match[uriREQueryGroup]
	fragment := match[uriREFragmentGroup]

	// Empty components are always acceptable here; only non-empty ones are
	// checked against their detailed grammar.
	if scheme != "" && !schemeRE.MatchString(scheme) {
		return IRI{}, fmt.Errorf("%q is not a valid IRI: invalid scheme %q does not match regexp %s", s, scheme, schemeRE)
	}
	if authority != "" && !iauthorityRE.MatchString(authority) {
		return IRI{}, fmt.Errorf("%q is not a valid IRI: invalid authority %q does not match regexp %s", s, authority, iauthorityRE)
	}
	if path != "" && !ipathRE.MatchString(path) {
		return IRI{}, fmt.Errorf("%q is not a valid IRI: invalid path %q does not match regexp %s", s, path, ipathRE)
	}
	if query != "" && !iqueryRE.MatchString(query) {
		return IRI{}, fmt.Errorf("%q is not a valid IRI: invalid query %q does not match regexp %s", s, query, iqueryRE)
	}
	if fragment != "" && !ifragmentRE.MatchString(fragment) {
		return IRI{}, fmt.Errorf("%q is not a valid IRI: invalid fragment %q does not match regexp %s", s, fragment, ifragmentRE)
	}

	parsed := IRI{
		Scheme:         scheme,
		ForceAuthority: match[uriREAuthorityWithSlashSlashGroup] != "" && authority == "",
		Authority:      authority,
		Path:           path,
		ForceQuery:     match[uriREQueryWithMarkGroup] != "" && query == "",
		Query:          query,
		ForceFragment:  match[uriREFragmentWithHashGroup] != "" && fragment == "",
		Fragment:       fragment,
	}

	// The normalized value is discarded on purpose: normalization is run only
	// to validate the percent-encoding, the caller gets the original spelling.
	if _, err := NormalizePercentEncoding(parsed); err != nil {
		return IRI{}, fmt.Errorf("%q is not a valid IRI: invalid percent encoding: %w", s, err)
	}

	return parsed, nil
}

// String renders the IRI as a single string by concatenating its components
// together with their delimiters.
// Components are emitted verbatim: values that were set manually must already
// be in valid form, because no escaping is applied here.
func (iri IRI) String() string {
	var sb strings.Builder
	if iri.hasScheme() {
		sb.WriteString(iri.Scheme + ":")
	}
	if iri.hasAuthority() {
		sb.WriteString("//" + iri.Authority)
	}
	// The path carries no delimiter of its own and is always written.
	sb.WriteString(iri.Path)
	if iri.hasQuery() {
		sb.WriteString("?" + iri.Query)
	}
	if iri.hasFragment() {
		sb.WriteString("#" + iri.Fragment)
	}
	return sb.String()
}

// hasScheme reports whether the Scheme component is non-empty.
func (iri IRI) hasScheme() bool {
	return len(iri.Scheme) > 0
}

// hasAuthority reports whether an authority is to be rendered: either one is
// set, or ForceAuthority requests the bare "//".
func (iri IRI) hasAuthority() bool {
	return len(iri.Authority) > 0 || iri.ForceAuthority
}

// hasQuery reports whether a query is to be rendered: either one is set, or
// ForceQuery requests the bare '?'.
func (iri IRI) hasQuery() bool {
	return len(iri.Query) > 0 || iri.ForceQuery
}

// hasFragment reports whether a fragment is to be rendered: either one is
// set, or ForceFragment requests the bare '#'.
func (iri IRI) hasFragment() bool {
	return len(iri.Fragment) > 0 || iri.ForceFragment
}

// ResolveReference treats the receiver as the absolute base IRI and resolves
// the IRI reference other against it, per RFC 3986 Section 5.2. The reference
// may be relative or absolute; the result is an absolute IRI.
func (iri IRI) ResolveReference(other IRI) IRI {
	base, reference := iri, other
	return resolveReference(base, reference)
}

// NormalizePercentEncoding returns a copy of iri in which percent-escapes
// that are not required have been replaced by the characters they stand for.
// Authority, Path, Query and Fragment are processed in that order; Scheme and
// the Force* flags are carried over unchanged. The first component that fails
// to normalize aborts the operation, yielding an empty IRI and that error.
//
// RFC3987 discusses this normalization procedure in 5.3.2.3:
// https://www.ietf.org/rfc/rfc3987.html#section-5.3.2.3.
func NormalizePercentEncoding(iri IRI) (IRI, error) {
	result := iri
	components := []*string{
		&result.Authority,
		&result.Path,
		&result.Query,
		&result.Fragment,
	}
	for _, component := range components {
		normalized, err := normalizePercentEncoding(*component)
		if err != nil {
			return IRI{}, err
		}
		*component = normalized
	}
	return result, nil
}

// normalizePercentEncoding replaces unreserved percent-encoded characters with their equivalent.
//
// Every maximal run of percent-escapes in the input is decoded to octets, the
// octets are interpreted as UTF-8, and each resulting code point is re-emitted
// through toUnreservedString. If a run does not decode to valid UTF-8, the run
// is left as is and an error describing the first such run is returned
// together with an empty string.
//
// Normalization background reading:
// - https://blog.golang.org/normalization
// - https://www.ietf.org/rfc/rfc3987.html#section-5
//   - https://www.ietf.org/rfc/rfc3987.html#section-5.3.2.3 - percent encoding
func normalizePercentEncoding(in string) (string, error) {
	var firstErr error
	replaced := pctEncodedCharOneOrMore.ReplaceAllStringFunc(in, func(pctEscaped string) string {
		var normalized strings.Builder
		unconsumedOctets := octetsFrom(pctEscaped)
		octetsOffset := 0
		for len(unconsumedOctets) > 0 {
			codePoint, size := utf8.DecodeRune(unconsumedOctets)
			// Malformed UTF-8 is reported as (RuneError, 1). A RuneError with a
			// larger size is a correctly encoded U+FFFD and must not be rejected.
			if codePoint == utf8.RuneError && size == 1 {
				if firstErr == nil {
					// Each octet occupies three characters ("%XX") in the escaped text.
					firstErr = fmt.Errorf("percent-encoded sequence %q contains invalid UTF-8 code point at start", pctEscaped[octetsOffset*3:])
				}
				return pctEscaped
			}
			normalized.WriteString(toUnreservedString(codePoint))
			unconsumedOctets = unconsumedOctets[size:]
			octetsOffset += size
		}
		return normalized.String()
	})
	if firstErr != nil {
		return "", firstErr
	}
	return replaced, nil
}

// Lookup tables for converting between octets and their two-digit
// hexadecimal spelling; both are filled once at package initialization.
var (
	// hexToByte maps each two-digit uppercase hexadecimal string ("00".."FF")
	// to the octet it denotes. Lowercase spellings are not present as keys.
	hexToByte = func() map[string]byte {
		table := make(map[string]byte, 256)
		for octet := 0; octet < 256; octet++ {
			key := fmt.Sprintf("%02X", octet)
			table[key] = byte(octet)
		}
		return table
	}()
	// byteToUppercasePercentEncoding maps each octet to its percent-encoded
	// form with uppercase hexadecimal digits ("%00".."%FF").
	byteToUppercasePercentEncoding = func() map[byte]string {
		table := make(map[byte]string, 256)
		for octet := 0; octet < 256; octet++ {
			table[byte(octet)] = fmt.Sprintf("%%%02X", octet)
		}
		return table
	}()
)

// octetsFrom decodes a run of percent-escapes ("%XX%YY...") into the octets
// it denotes. The input is consumed in groups of three characters; the first
// character of each group is skipped and the remaining two are looked up,
// case-insensitively, in hexToByte. Trailing characters that do not form a
// complete group are ignored.
func octetsFrom(percentEncoded string) []byte {
	count := len(percentEncoded) / 3
	octets := make([]byte, 0, count)
	for offset := 0; offset < count*3; offset += 3 {
		hexDigits := percentEncoded[offset+1 : offset+3]
		// The table is keyed by uppercase digits only.
		octets = append(octets, hexToByte[strings.ToUpper(hexDigits)])
	}
	return octets
}

// toUnreservedString returns the spelling of r to use in normalized output:
// the character itself when it matches iunreservedRE, otherwise the uppercase
// percent-encoding of each octet of its UTF-8 representation.
func toUnreservedString(r rune) string {
	literal := string(r)
	if iunreservedRE.MatchString(literal) {
		return literal
	}
	var scratch [utf8.UTFMax]byte
	width := utf8.EncodeRune(scratch[:], r)
	var encoded strings.Builder
	for _, octet := range scratch[:width] {
		encoded.WriteString(byteToUppercasePercentEncoding[octet])
	}
	return encoded.String()
}