escape.gno

7.08 Kb · 298 lines
  1package json
  2
  3import (
  4	"unicode/utf8"
  5)
  6
  7const (
  8	supplementalPlanesOffset     = 0x10000
  9	highSurrogateOffset          = 0xD800
 10	lowSurrogateOffset           = 0xDC00
 11	surrogateEnd                 = 0xDFFF
 12	basicMultilingualPlaneOffset = 0xFFFF
 13	badHex                       = -1
 14
 15	singleUnicodeEscapeLen = 6
 16	surrogatePairLen       = 12
 17)
 18
 19var hexLookupTable = [256]int{
 20	'0': 0x0, '1': 0x1, '2': 0x2, '3': 0x3, '4': 0x4,
 21	'5': 0x5, '6': 0x6, '7': 0x7, '8': 0x8, '9': 0x9,
 22	'A': 0xA, 'B': 0xB, 'C': 0xC, 'D': 0xD, 'E': 0xE, 'F': 0xF,
 23	'a': 0xA, 'b': 0xB, 'c': 0xC, 'd': 0xD, 'e': 0xE, 'f': 0xF,
 24	// Fill unspecified index-value pairs with key and value of -1
 25	'G': -1, 'H': -1, 'I': -1, 'J': -1,
 26	'K': -1, 'L': -1, 'M': -1, 'N': -1,
 27	'O': -1, 'P': -1, 'Q': -1, 'R': -1,
 28	'S': -1, 'T': -1, 'U': -1, 'V': -1,
 29	'W': -1, 'X': -1, 'Y': -1, 'Z': -1,
 30	'g': -1, 'h': -1, 'i': -1, 'j': -1,
 31	'k': -1, 'l': -1, 'm': -1, 'n': -1,
 32	'o': -1, 'p': -1, 'q': -1, 'r': -1,
 33	's': -1, 't': -1, 'u': -1, 'v': -1,
 34	'w': -1, 'x': -1, 'y': -1, 'z': -1,
 35}
 36
 37func h2i(c byte) int {
 38	return hexLookupTable[c]
 39}
 40
 41// Unescape takes an input byte slice, processes it to Unescape certain characters,
 42// and writes the result into an output byte slice.
 43//
 44// it returns the processed slice and any error encountered during the Unescape operation.
 45func Unescape(input, output []byte) ([]byte, error) {
 46	// ensure the output slice has enough capacity to hold the input slice.
 47	inputLen := len(input)
 48	if cap(output) < inputLen {
 49		output = make([]byte, inputLen)
 50	}
 51
 52	inPos, outPos := 0, 0
 53
 54	for inPos < len(input) {
 55		c := input[inPos]
 56		if c != backSlash {
 57			output[outPos] = c
 58			inPos++
 59			outPos++
 60		} else {
 61			// process escape sequence
 62			inLen, outLen, err := processEscapedUTF8(input[inPos:], output[outPos:])
 63			if err != nil {
 64				return nil, err
 65			}
 66			inPos += inLen
 67			outPos += outLen
 68		}
 69	}
 70
 71	return output[:outPos], nil
 72}
 73
 74// isSurrogatePair returns true if the rune is a surrogate pair.
 75//
 76// A surrogate pairs are used in UTF-16 encoding to encode characters
 77// outside the Basic Multilingual Plane (BMP).
 78func isSurrogatePair(r rune) bool {
 79	return highSurrogateOffset <= r && r <= surrogateEnd
 80}
 81
 82// isHighSurrogate checks if the rune is a high surrogate (U+D800 to U+DBFF).
 83func isHighSurrogate(r rune) bool {
 84	return r >= highSurrogateOffset && r <= 0xDBFF
 85}
 86
 87// isLowSurrogate checks if the rune is a low surrogate (U+DC00 to U+DFFF).
 88func isLowSurrogate(r rune) bool {
 89	return r >= lowSurrogateOffset && r <= surrogateEnd
 90}
 91
 92// combineSurrogates reconstruct the original unicode code points in the
 93// supplemental plane by combinin the high and low surrogate.
 94//
 95// The hight surrogate in the range from U+D800 to U+DBFF,
 96// and the low surrogate in the range from U+DC00 to U+DFFF.
 97//
 98// The formula to combine the surrogates is:
 99// (high - 0xD800) * 0x400 + (low - 0xDC00) + 0x10000
100func combineSurrogates(high, low rune) rune {
101	return ((high - highSurrogateOffset) << 10) + (low - lowSurrogateOffset) + supplementalPlanesOffset
102}
103
104// deocdeSingleUnicodeEscape decodes a unicode escape sequence (e.g., \uXXXX) into a rune.
105func decodeSingleUnicodeEscape(b []byte) (rune, bool) {
106	if len(b) < 6 {
107		return utf8.RuneError, false
108	}
109
110	// convert hex to decimal
111	h1, h2, h3, h4 := h2i(b[2]), h2i(b[3]), h2i(b[4]), h2i(b[5])
112	if h1 == badHex || h2 == badHex || h3 == badHex || h4 == badHex {
113		return utf8.RuneError, false
114	}
115
116	return rune(h1<<12 + h2<<8 + h3<<4 + h4), true
117}
118
119// decodeUnicodeEscape decodes a Unicode escape sequence from a byte slice.
120// It handles both single Unicode escape sequences and surrogate pairs.
121func decodeUnicodeEscape(b []byte) (rune, int) {
122	// decode the first Unicode escape sequence.
123	r, ok := decodeSingleUnicodeEscape(b)
124	if !ok {
125		return utf8.RuneError, -1
126	}
127
128	// if the rune is within the BMP and not a surrogate, return it
129	if r <= basicMultilingualPlaneOffset && !isSurrogatePair(r) {
130		return r, 6
131	}
132
133	if !isHighSurrogate(r) {
134		// invalid surrogate pair.
135		return utf8.RuneError, -1
136	}
137
138	// if the rune is a high surrogate, need to decode the next escape sequence.
139
140	// ensure there are enough bytes for the next escape sequence.
141	if len(b) < surrogatePairLen {
142		return utf8.RuneError, -1
143	}
144	// decode the second Unicode escape sequence.
145	r2, ok := decodeSingleUnicodeEscape(b[singleUnicodeEscapeLen:])
146	if !ok {
147		return utf8.RuneError, -1
148	}
149	// check if the second rune is a low surrogate.
150	if isLowSurrogate(r2) {
151		combined := combineSurrogates(r, r2)
152		return combined, surrogatePairLen
153	}
154	return utf8.RuneError, -1
155}
156
157var escapeByteSet = [256]byte{
158	'"':  doubleQuote,
159	'\\': backSlash,
160	'/':  slash,
161	'b':  backSpace,
162	'f':  formFeed,
163	'n':  newLine,
164	'r':  carriageReturn,
165	't':  tab,
166}
167
168// Unquote takes a byte slice and unquotes it by removing
169// the surrounding quotes and unescaping the contents.
170func Unquote(s []byte, border byte) (string, bool) {
171	s, ok := unquoteBytes(s, border)
172	return string(s), ok
173}
174
175// unquoteBytes takes a byte slice and unquotes it by removing
176func unquoteBytes(s []byte, border byte) ([]byte, bool) {
177	if len(s) < 2 || s[0] != border || s[len(s)-1] != border {
178		return nil, false
179	}
180
181	s = s[1 : len(s)-1]
182
183	r := 0
184	for r < len(s) {
185		c := s[r]
186
187		if c == backSlash || c == border || c < 0x20 {
188			break
189		}
190
191		if c < utf8.RuneSelf {
192			r++
193			continue
194		}
195
196		rr, size := utf8.DecodeRune(s[r:])
197		if rr == utf8.RuneError && size == 1 {
198			break
199		}
200
201		r += size
202	}
203
204	if r == len(s) {
205		return s, true
206	}
207
208	utfDoubleMax := utf8.UTFMax * 2
209	b := make([]byte, len(s)+utfDoubleMax)
210	w := copy(b, s[0:r])
211
212	for r < len(s) {
213		if w >= len(b)-utf8.UTFMax {
214			nb := make([]byte, utfDoubleMax+(2*len(b)))
215			copy(nb, b)
216			b = nb
217		}
218
219		c := s[r]
220		if c == backSlash {
221			r++
222			if r >= len(s) {
223				return nil, false
224			}
225
226			if s[r] == 'u' {
227				rr, res := decodeUnicodeEscape(s[r-1:])
228				if res < 0 {
229					return nil, false
230				}
231
232				w += utf8.EncodeRune(b[w:], rr)
233				r += 5
234			} else {
235				decode := escapeByteSet[s[r]]
236				if decode == 0 {
237					return nil, false
238				}
239
240				if decode == doubleQuote || decode == backSlash || decode == slash {
241					decode = s[r]
242				}
243
244				b[w] = decode
245				r++
246				w++
247			}
248		} else if c == border || c < 0x20 {
249			return nil, false
250		} else if c < utf8.RuneSelf {
251			b[w] = c
252			r++
253			w++
254		} else {
255			rr, size := utf8.DecodeRune(s[r:])
256
257			if rr == utf8.RuneError && size == 1 {
258				return nil, false
259			}
260
261			r += size
262			w += utf8.EncodeRune(b[w:], rr)
263		}
264	}
265
266	return b[:w], true
267}
268
269// processEscapedUTF8 converts escape sequences to UTF-8 characters.
270// It decodes Unicode escape sequences (\uXXXX) to UTF-8 and
271// converts standard escape sequences (e.g., \n) to their corresponding special characters.
272func processEscapedUTF8(in, out []byte) (int, int, error) {
273	if len(in) < 2 || in[0] != backSlash {
274		return -1, -1, errInvalidEscapeSequence
275	}
276
277	escapeSeqLen := 2
278	escapeChar := in[1]
279
280	if escapeChar != 'u' {
281		val := escapeByteSet[escapeChar]
282		if val == 0 {
283			return -1, -1, errInvalidEscapeSequence
284		}
285
286		out[0] = val
287		return escapeSeqLen, 1, nil
288	}
289
290	r, size := decodeUnicodeEscape(in)
291	if size == -1 {
292		return -1, -1, errInvalidEscapeSequence
293	}
294
295	outLen := utf8.EncodeRune(out, r)
296
297	return size, outLen, nil
298}