escape.gno
7.08 Kb · 298 lines
1package json
2
3import (
4 "unicode/utf8"
5)
6
// Code point boundaries used while decoding \uXXXX escape sequences.
const (
	// highSurrogateOffset is the first code point of the surrogate range and
	// the first high surrogate.
	highSurrogateOffset = 0xD800
	// lowSurrogateOffset is the first low surrogate.
	lowSurrogateOffset = 0xDC00
	// surrogateEnd is the last code point of the surrogate range.
	surrogateEnd = 0xDFFF
	// basicMultilingualPlaneOffset is the upper bound used for the Basic
	// Multilingual Plane check.
	basicMultilingualPlaneOffset = 0xFFFF
	// supplementalPlanesOffset is added when a surrogate pair is combined
	// into a single code point.
	supplementalPlanesOffset = 0x10000
)

// Sentinel value and byte lengths used by the escape decoders.
const (
	// badHex is the value that marks a byte that is not a hexadecimal digit.
	badHex = -1

	// singleUnicodeEscapeLen is the byte length of one \uXXXX escape.
	singleUnicodeEscapeLen = 6
	// surrogatePairLen is the byte length of two consecutive \uXXXX escapes.
	surrogatePairLen = 12
)
18
// hexLookupTable maps every byte value to the numeric value of the
// hexadecimal digit it spells ('0'-'9', 'a'-'f', 'A'-'F'), or to -1 (the
// badHex sentinel) for every other byte.
//
// The table is built programmatically so that all 256 entries are defined.
// A keyed array literal leaves unlisted entries at zero, which would make
// bytes such as ' ', '@' or 0xFF look like the valid hex digit 0.
var hexLookupTable = buildHexLookupTable()

// buildHexLookupTable returns the fully populated table backing h2i.
func buildHexLookupTable() [256]int {
	var table [256]int

	// Start from "not a hex digit" everywhere; -1 is the value of badHex.
	for i := range table {
		table[i] = -1
	}

	for c := byte('0'); c <= '9'; c++ {
		table[c] = int(c - '0')
	}

	// Fill the lowercase and uppercase letter digits together.
	for c := byte('a'); c <= 'f'; c++ {
		value := int(c-'a') + 10
		table[c] = value
		table[c-'a'+'A'] = value
	}

	return table
}

// h2i converts a single hexadecimal digit byte to its numeric value (0-15).
// It returns -1 (badHex) when c is not a hexadecimal digit.
func h2i(c byte) int {
	return hexLookupTable[c]
}
40
41// Unescape takes an input byte slice, processes it to Unescape certain characters,
42// and writes the result into an output byte slice.
43//
44// it returns the processed slice and any error encountered during the Unescape operation.
45func Unescape(input, output []byte) ([]byte, error) {
46 // ensure the output slice has enough capacity to hold the input slice.
47 inputLen := len(input)
48 if cap(output) < inputLen {
49 output = make([]byte, inputLen)
50 }
51
52 inPos, outPos := 0, 0
53
54 for inPos < len(input) {
55 c := input[inPos]
56 if c != backSlash {
57 output[outPos] = c
58 inPos++
59 outPos++
60 } else {
61 // process escape sequence
62 inLen, outLen, err := processEscapedUTF8(input[inPos:], output[outPos:])
63 if err != nil {
64 return nil, err
65 }
66 inPos += inLen
67 outPos += outLen
68 }
69 }
70
71 return output[:outPos], nil
72}
73
74// isSurrogatePair returns true if the rune is a surrogate pair.
75//
76// A surrogate pairs are used in UTF-16 encoding to encode characters
77// outside the Basic Multilingual Plane (BMP).
78func isSurrogatePair(r rune) bool {
79 return highSurrogateOffset <= r && r <= surrogateEnd
80}
81
82// isHighSurrogate checks if the rune is a high surrogate (U+D800 to U+DBFF).
83func isHighSurrogate(r rune) bool {
84 return r >= highSurrogateOffset && r <= 0xDBFF
85}
86
87// isLowSurrogate checks if the rune is a low surrogate (U+DC00 to U+DFFF).
88func isLowSurrogate(r rune) bool {
89 return r >= lowSurrogateOffset && r <= surrogateEnd
90}
91
92// combineSurrogates reconstruct the original unicode code points in the
93// supplemental plane by combinin the high and low surrogate.
94//
95// The hight surrogate in the range from U+D800 to U+DBFF,
96// and the low surrogate in the range from U+DC00 to U+DFFF.
97//
98// The formula to combine the surrogates is:
99// (high - 0xD800) * 0x400 + (low - 0xDC00) + 0x10000
100func combineSurrogates(high, low rune) rune {
101 return ((high - highSurrogateOffset) << 10) + (low - lowSurrogateOffset) + supplementalPlanesOffset
102}
103
104// deocdeSingleUnicodeEscape decodes a unicode escape sequence (e.g., \uXXXX) into a rune.
105func decodeSingleUnicodeEscape(b []byte) (rune, bool) {
106 if len(b) < 6 {
107 return utf8.RuneError, false
108 }
109
110 // convert hex to decimal
111 h1, h2, h3, h4 := h2i(b[2]), h2i(b[3]), h2i(b[4]), h2i(b[5])
112 if h1 == badHex || h2 == badHex || h3 == badHex || h4 == badHex {
113 return utf8.RuneError, false
114 }
115
116 return rune(h1<<12 + h2<<8 + h3<<4 + h4), true
117}
118
119// decodeUnicodeEscape decodes a Unicode escape sequence from a byte slice.
120// It handles both single Unicode escape sequences and surrogate pairs.
121func decodeUnicodeEscape(b []byte) (rune, int) {
122 // decode the first Unicode escape sequence.
123 r, ok := decodeSingleUnicodeEscape(b)
124 if !ok {
125 return utf8.RuneError, -1
126 }
127
128 // if the rune is within the BMP and not a surrogate, return it
129 if r <= basicMultilingualPlaneOffset && !isSurrogatePair(r) {
130 return r, 6
131 }
132
133 if !isHighSurrogate(r) {
134 // invalid surrogate pair.
135 return utf8.RuneError, -1
136 }
137
138 // if the rune is a high surrogate, need to decode the next escape sequence.
139
140 // ensure there are enough bytes for the next escape sequence.
141 if len(b) < surrogatePairLen {
142 return utf8.RuneError, -1
143 }
144 // decode the second Unicode escape sequence.
145 r2, ok := decodeSingleUnicodeEscape(b[singleUnicodeEscapeLen:])
146 if !ok {
147 return utf8.RuneError, -1
148 }
149 // check if the second rune is a low surrogate.
150 if isLowSurrogate(r2) {
151 combined := combineSurrogates(r, r2)
152 return combined, surrogatePairLen
153 }
154 return utf8.RuneError, -1
155}
156
157var escapeByteSet = [256]byte{
158 '"': doubleQuote,
159 '\\': backSlash,
160 '/': slash,
161 'b': backSpace,
162 'f': formFeed,
163 'n': newLine,
164 'r': carriageReturn,
165 't': tab,
166}
167
168// Unquote takes a byte slice and unquotes it by removing
169// the surrounding quotes and unescaping the contents.
170func Unquote(s []byte, border byte) (string, bool) {
171 s, ok := unquoteBytes(s, border)
172 return string(s), ok
173}
174
175// unquoteBytes takes a byte slice and unquotes it by removing
176func unquoteBytes(s []byte, border byte) ([]byte, bool) {
177 if len(s) < 2 || s[0] != border || s[len(s)-1] != border {
178 return nil, false
179 }
180
181 s = s[1 : len(s)-1]
182
183 r := 0
184 for r < len(s) {
185 c := s[r]
186
187 if c == backSlash || c == border || c < 0x20 {
188 break
189 }
190
191 if c < utf8.RuneSelf {
192 r++
193 continue
194 }
195
196 rr, size := utf8.DecodeRune(s[r:])
197 if rr == utf8.RuneError && size == 1 {
198 break
199 }
200
201 r += size
202 }
203
204 if r == len(s) {
205 return s, true
206 }
207
208 utfDoubleMax := utf8.UTFMax * 2
209 b := make([]byte, len(s)+utfDoubleMax)
210 w := copy(b, s[0:r])
211
212 for r < len(s) {
213 if w >= len(b)-utf8.UTFMax {
214 nb := make([]byte, utfDoubleMax+(2*len(b)))
215 copy(nb, b)
216 b = nb
217 }
218
219 c := s[r]
220 if c == backSlash {
221 r++
222 if r >= len(s) {
223 return nil, false
224 }
225
226 if s[r] == 'u' {
227 rr, res := decodeUnicodeEscape(s[r-1:])
228 if res < 0 {
229 return nil, false
230 }
231
232 w += utf8.EncodeRune(b[w:], rr)
233 r += 5
234 } else {
235 decode := escapeByteSet[s[r]]
236 if decode == 0 {
237 return nil, false
238 }
239
240 if decode == doubleQuote || decode == backSlash || decode == slash {
241 decode = s[r]
242 }
243
244 b[w] = decode
245 r++
246 w++
247 }
248 } else if c == border || c < 0x20 {
249 return nil, false
250 } else if c < utf8.RuneSelf {
251 b[w] = c
252 r++
253 w++
254 } else {
255 rr, size := utf8.DecodeRune(s[r:])
256
257 if rr == utf8.RuneError && size == 1 {
258 return nil, false
259 }
260
261 r += size
262 w += utf8.EncodeRune(b[w:], rr)
263 }
264 }
265
266 return b[:w], true
267}
268
269// processEscapedUTF8 converts escape sequences to UTF-8 characters.
270// It decodes Unicode escape sequences (\uXXXX) to UTF-8 and
271// converts standard escape sequences (e.g., \n) to their corresponding special characters.
272func processEscapedUTF8(in, out []byte) (int, int, error) {
273 if len(in) < 2 || in[0] != backSlash {
274 return -1, -1, errInvalidEscapeSequence
275 }
276
277 escapeSeqLen := 2
278 escapeChar := in[1]
279
280 if escapeChar != 'u' {
281 val := escapeByteSet[escapeChar]
282 if val == 0 {
283 return -1, -1, errInvalidEscapeSequence
284 }
285
286 out[0] = val
287 return escapeSeqLen, 1, nil
288 }
289
290 r, size := decodeUnicodeEscape(in)
291 if size == -1 {
292 return -1, -1, errInvalidEscapeSequence
293 }
294
295 outLen := utf8.EncodeRune(out, r)
296
297 return size, outLen, nil
298}