utf8.js (5145B)
1 /*! https://mths.be/utf8js v2.1.2 by @mathias */ 2 3 var stringFromCharCode = String.fromCharCode; 4 5 // Taken from https://mths.be/punycode 6 function ucs2decode(string) { 7 var output = []; 8 var counter = 0; 9 var length = string.length; 10 var value; 11 var extra; 12 while (counter < length) { 13 value = string.charCodeAt(counter++); 14 if (value >= 0xD800 && value <= 0xDBFF && counter < length) { 15 // high surrogate, and there is a next character 16 extra = string.charCodeAt(counter++); 17 if ((extra & 0xFC00) == 0xDC00) { // low surrogate 18 output.push(((value & 0x3FF) << 10) + (extra & 0x3FF) + 0x10000); 19 } else { 20 // unmatched surrogate; only append this code unit, in case the next 21 // code unit is the high surrogate of a surrogate pair 22 output.push(value); 23 counter--; 24 } 25 } else { 26 output.push(value); 27 } 28 } 29 return output; 30 } 31 32 // Taken from https://mths.be/punycode 33 function ucs2encode(array) { 34 var length = array.length; 35 var index = -1; 36 var value; 37 var output = ''; 38 while (++index < length) { 39 value = array[index]; 40 if (value > 0xFFFF) { 41 value -= 0x10000; 42 output += stringFromCharCode(value >>> 10 & 0x3FF | 0xD800); 43 value = 0xDC00 | value & 0x3FF; 44 } 45 output += stringFromCharCode(value); 46 } 47 return output; 48 } 49 50 function checkScalarValue(codePoint, strict) { 51 if (codePoint >= 0xD800 && codePoint <= 0xDFFF) { 52 if (strict) { 53 throw Error( 54 'Lone surrogate U+' + codePoint.toString(16).toUpperCase() + 55 ' is not a scalar value' 56 ); 57 } 58 return false; 59 } 60 return true; 61 } 62 /*--------------------------------------------------------------------------*/ 63 64 function createByte(codePoint, shift) { 65 return stringFromCharCode(((codePoint >> shift) & 0x3F) | 0x80); 66 } 67 68 function encodeCodePoint(codePoint, strict) { 69 if ((codePoint & 0xFFFFFF80) == 0) { // 1-byte sequence 70 return stringFromCharCode(codePoint); 71 } 72 var symbol = ''; 73 if ((codePoint & 0xFFFFF800) == 0) { // 2-byte sequence 74 symbol = stringFromCharCode(((codePoint >> 6) & 0x1F) | 0xC0); 75 } 76 else if ((codePoint & 0xFFFF0000) == 0) { // 3-byte sequence 77 if (!checkScalarValue(codePoint, strict)) { 78 codePoint = 0xFFFD; 79 } 80 symbol = stringFromCharCode(((codePoint >> 12) & 0x0F) | 0xE0); 81 symbol += createByte(codePoint, 6); 82 } 83 else if ((codePoint & 0xFFE00000) == 0) { // 4-byte sequence 84 symbol = stringFromCharCode(((codePoint >> 18) & 0x07) | 0xF0); 85 symbol += createByte(codePoint, 12); 86 symbol += createByte(codePoint, 6); 87 } 88 symbol += stringFromCharCode((codePoint & 0x3F) | 0x80); 89 return symbol; 90 } 91 92 function utf8encode(string, opts) { 93 opts = opts || {}; 94 var strict = false !== opts.strict; 95 96 var codePoints = ucs2decode(string); 97 var length = codePoints.length; 98 var index = -1; 99 var codePoint; 100 var byteString = ''; 101 while (++index < length) { 102 codePoint = codePoints[index]; 103 byteString += encodeCodePoint(codePoint, strict); 104 } 105 return byteString; 106 } 107 108 /*--------------------------------------------------------------------------*/ 109 110 function readContinuationByte() { 111 if (byteIndex >= byteCount) { 112 throw Error('Invalid byte index'); 113 } 114 115 var continuationByte = byteArray[byteIndex] & 0xFF; 116 byteIndex++; 117 118 if ((continuationByte & 0xC0) == 0x80) { 119 return continuationByte & 0x3F; 120 } 121 122 // If we end up here, it’s not a continuation byte 123 throw Error('Invalid continuation byte'); 124 } 125 126 function decodeSymbol(strict) { 127 var byte1; 128 var byte2; 129 var byte3; 130 var byte4; 131 var codePoint; 132 133 if (byteIndex > byteCount) { 134 throw Error('Invalid byte index'); 135 } 136 137 if (byteIndex == byteCount) { 138 return false; 139 } 140 141 // Read first byte 142 byte1 = byteArray[byteIndex] & 0xFF; 143 byteIndex++; 144 145 // 1-byte sequence (no continuation bytes) 146 if ((byte1 & 0x80) == 0) { 147 return byte1; 148 } 149 150 // 2-byte sequence 151 if ((byte1 & 0xE0) == 0xC0) { 152 byte2 = readContinuationByte(); 153 codePoint = ((byte1 & 0x1F) << 6) | byte2; 154 if (codePoint >= 0x80) { 155 return codePoint; 156 } else { 157 throw Error('Invalid continuation byte'); 158 } 159 } 160 161 // 3-byte sequence (may include unpaired surrogates) 162 if ((byte1 & 0xF0) == 0xE0) { 163 byte2 = readContinuationByte(); 164 byte3 = readContinuationByte(); 165 codePoint = ((byte1 & 0x0F) << 12) | (byte2 << 6) | byte3; 166 if (codePoint >= 0x0800) { 167 return checkScalarValue(codePoint, strict) ? codePoint : 0xFFFD; 168 } else { 169 throw Error('Invalid continuation byte'); 170 } 171 } 172 173 // 4-byte sequence 174 if ((byte1 & 0xF8) == 0xF0) { 175 byte2 = readContinuationByte(); 176 byte3 = readContinuationByte(); 177 byte4 = readContinuationByte(); 178 codePoint = ((byte1 & 0x07) << 0x12) | (byte2 << 0x0C) | 179 (byte3 << 0x06) | byte4; 180 if (codePoint >= 0x010000 && codePoint <= 0x10FFFF) { 181 return codePoint; 182 } 183 } 184 185 throw Error('Invalid UTF-8 detected'); 186 } 187 188 var byteArray; 189 var byteCount; 190 var byteIndex; 191 function utf8decode(byteString, opts) { 192 opts = opts || {}; 193 var strict = false !== opts.strict; 194 195 byteArray = ucs2decode(byteString); 196 byteCount = byteArray.length; 197 byteIndex = 0; 198 var codePoints = []; 199 var tmp; 200 while ((tmp = decodeSymbol(strict)) !== false) { 201 codePoints.push(tmp); 202 } 203 return ucs2encode(codePoints); 204 } 205 206 module.exports = { 207 version: '2.1.2', 208 encode: utf8encode, 209 decode: utf8decode 210 };