dbcs-codec.js (21415B)
"use strict";
var Buffer = require("safer-buffer").Buffer;

// Multibyte codec. In this scheme, a character is represented by 1 or more bytes.
// Our codec supports UTF-16 surrogates, extensions for GB18030 and unicode sequences.
// To save memory and loading time, we read table files only when requested.

exports._dbcs = DBCSCodec;

// Special (negative) values stored in the decode/encode tables.
var UNASSIGNED = -1,        // No mapping for this byte / character.
    GB18030_CODE = -2,      // Decode tables: end of a GB18030 4-byte sequence.
    SEQ_START  = -10,       // Values <= SEQ_START refer to a sequence: index = SEQ_START - value.
    NODE_START = -1000,     // Decode tables: values <= NODE_START refer to a trie node: index = NODE_START - value.
    UNASSIGNED_NODE = new Array(0x100), // Template of an empty 256-slot node; always copied with slice(0).
    DEF_CHAR = -1;          // Key in encodeTableSeq nodes meaning "the sequence may end here".

for (var i = 0; i < 0x100; i++)
    UNASSIGNED_NODE[i] = UNASSIGNED;


// Class DBCSCodec reads and initializes mapping tables.
//
// codecOptions:
//   encodingName   - name used in error messages.
//   table          - function returning the mapping table (array of chunks, see _addDecodeChunk).
//   encodeSkipVals - optional array of multibyte codes (numbers or {from, to} ranges) that are
//                    decoded but never produced by the encoder.
//   encodeAdd      - optional object {unicodeChar: dbcsCode} with additional encode-only pairs.
//   gb18030        - optional function returning GB18030 ranges ({uChars, gbChars}).
// iconv: object providing defaultCharUnicode and defaultCharSingleByte.
//
// Throws Error when codecOptions or codecOptions.table is missing.
function DBCSCodec(codecOptions, iconv) {
    // Validate before touching any property, so that the intended error (not a TypeError) is thrown.
    if (!codecOptions)
        throw new Error("DBCS codec is called without the data.");
    this.encodingName = codecOptions.encodingName;
    if (!codecOptions.table)
        throw new Error("Encoding '" + this.encodingName + "' has no data.");

    // Load tables.
    var mappingTable = codecOptions.table();


    // Decode tables: MBCS -> Unicode.

    // decodeTables is a trie, encoded as an array of arrays of integers. Internal arrays are trie nodes and all have len = 256.
    // Trie root is decodeTables[0].
    // Values: >=  0 -> unicode character code. can be > 0xFFFF
    //         == UNASSIGNED -> unknown/unassigned sequence.
    //         == GB18030_CODE -> this is the end of a GB18030 4-byte sequence.
    //         <= NODE_START -> index of the next node in our trie to process next byte.
    //         <= SEQ_START  -> index of the start of a character code sequence, in decodeTableSeq.
    this.decodeTables = [];
    this.decodeTables[0] = UNASSIGNED_NODE.slice(0); // Create root node.

    // Sometimes a MBCS char corresponds to a sequence of unicode chars. We store them as arrays of integers here.
    this.decodeTableSeq = [];

    // Actual mapping tables consist of chunks. Use them to fill up decode tables.
    for (var i = 0; i < mappingTable.length; i++)
        this._addDecodeChunk(mappingTable[i]);

    this.defaultCharUnicode = iconv.defaultCharUnicode;


    // Encode tables: Unicode -> DBCS.

    // `encodeTable` is array mapping from unicode char to encoded char. All its values are integers for performance.
    // Because it can be sparse, it is represented as array of buckets by 256 chars each. Bucket can be null.
    // Values: >=  0 -> it is a normal char. Write the value (1 to 4 bytes, depending on its magnitude).
    //         == UNASSIGNED -> no conversion found. Output a default char.
    //         <= SEQ_START  -> it's an index in encodeTableSeq, see below. The character starts a sequence.
    this.encodeTable = [];

    // `encodeTableSeq` is used when a sequence of unicode characters is encoded as a single code. We use a tree of
    // objects where keys correspond to characters in sequence and leafs are the encoded dbcs values. A special DEF_CHAR key
    // means end of sequence (needed when one sequence is a strict subsequence of another).
    // Objects are kept separately from encodeTable to increase performance.
    this.encodeTableSeq = [];

    // Some chars can be decoded, but need not be encoded.
    var skipEncodeChars = {};
    if (codecOptions.encodeSkipVals)
        for (var i = 0; i < codecOptions.encodeSkipVals.length; i++) {
            var val = codecOptions.encodeSkipVals[i];
            if (typeof val === 'number')
                skipEncodeChars[val] = true;
            else
                for (var j = val.from; j <= val.to; j++)
                    skipEncodeChars[j] = true;
        }

    // Use decode trie to recursively fill out encode tables.
    this._fillEncodeTable(0, 0, skipEncodeChars);

    // Add more encoding pairs when needed.
    if (codecOptions.encodeAdd) {
        for (var uChar in codecOptions.encodeAdd)
            if (Object.prototype.hasOwnProperty.call(codecOptions.encodeAdd, uChar))
                this._setEncodeChar(uChar.charCodeAt(0), codecOptions.encodeAdd[uChar]);
    }

    // Code written for characters that cannot be encoded: the configured default char if it is mapped,
    // otherwise the encoding's own '?', otherwise the plain ASCII code of '?'.
    // The bucket is indexed by char code, so '?' has to be converted with charCodeAt.
    var firstBucket = this._getEncodeBucket(0);
    this.defCharSB = firstBucket[iconv.defaultCharSingleByte.charCodeAt(0)];
    if (this.defCharSB === UNASSIGNED) this.defCharSB = firstBucket["?".charCodeAt(0)];
    if (this.defCharSB === UNASSIGNED) this.defCharSB = "?".charCodeAt(0);


    // Load & create GB18030 tables when needed.
    if (typeof codecOptions.gb18030 === 'function') {
        this.gb18030 = codecOptions.gb18030(); // Load GB18030 ranges.

        // Add GB18030 decode tables.
        var thirdByteNodeIdx = this.decodeTables.length;
        var thirdByteNode = this.decodeTables[thirdByteNodeIdx] = UNASSIGNED_NODE.slice(0);

        var fourthByteNodeIdx = this.decodeTables.length;
        var fourthByteNode = this.decodeTables[fourthByteNodeIdx] = UNASSIGNED_NODE.slice(0);

        // 4-byte sequences have the shape [0x81-0xFE] [0x30-0x39] [0x81-0xFE] [0x30-0x39].
        // All second-level nodes share the same third-byte node, which leads to a shared fourth-byte node.
        for (var i = 0x81; i <= 0xFE; i++) {
            var secondByteNodeIdx = NODE_START - this.decodeTables[0][i];
            var secondByteNode = this.decodeTables[secondByteNodeIdx];
            for (var j = 0x30; j <= 0x39; j++)
                secondByteNode[j] = NODE_START - thirdByteNodeIdx;
        }
        for (var i = 0x81; i <= 0xFE; i++)
            thirdByteNode[i] = NODE_START - fourthByteNodeIdx;
        for (var i = 0x30; i <= 0x39; i++)
            fourthByteNode[i] = GB18030_CODE;
    }
}

DBCSCodec.prototype.encoder = DBCSEncoder;
DBCSCodec.prototype.decoder = DBCSDecoder;

// Decoder helpers

// Returns the trie node that holds the entries for the last byte of multibyte code `addr`,
// creating intermediate nodes as needed. For a single-byte address this is the root node.
// Throws Error if a prefix of `addr` is already assigned to something other than a node.
DBCSCodec.prototype._getDecodeTrieNode = function(addr) {
    // Split the address into bytes, least significant first.
    // An unsigned shift is required: a signed one makes addresses >= 0x80000000 negative
    // after the first step, which would end the loop after a single byte.
    var bytes = [];
    for (var rest = addr; rest > 0; rest >>>= 8)
        bytes.push(rest & 0xFF);
    if (bytes.length == 0)
        bytes.push(0);

    var node = this.decodeTables[0];
    for (var i = bytes.length-1; i > 0; i--) { // Traverse nodes deeper into the trie.
        var val = node[bytes[i]];

        if (val == UNASSIGNED) { // Create new node.
            node[bytes[i]] = NODE_START - this.decodeTables.length;
            this.decodeTables.push(node = UNASSIGNED_NODE.slice(0));
        }
        else if (val <= NODE_START) { // Existing node.
            node = this.decodeTables[NODE_START - val];
        }
        else
            throw new Error("Overwrite byte in " + this.encodingName + ", addr: " + addr.toString(16));
    }
    return node;
}


// Writes one mapping-table chunk into the decode trie.
// chunk[0] is the hex multibyte code where the chunk starts; every following element is either
//   - a string: its characters are assigned to consecutive codes (surrogate pairs are combined into
//     one code point; a char in 0x0FF1..0x0FFF introduces a sequence of (0xFFF - char + 2) characters), or
//   - a number N: the next N codes map to consecutive code points following the previously written one.
// Throws Error on an invalid surrogate pair, an unsupported element type or a chunk that is too long.
DBCSCodec.prototype._addDecodeChunk = function(chunk) {
    // First element of chunk is the hex mbcs code where we start.
    var curAddr = parseInt(chunk[0], 16);

    // Choose the decoding node where we'll write our chars.
    var writeTable = this._getDecodeTrieNode(curAddr);
    curAddr = curAddr & 0xFF;

    // Write all other elements of the chunk to the table.
    for (var k = 1; k < chunk.length; k++) {
        var part = chunk[k];
        if (typeof part === "string") { // String, write as-is.
            for (var l = 0; l < part.length;) {
                var code = part.charCodeAt(l++);
                if (0xD800 <= code && code < 0xDC00) { // Decode surrogate
                    var codeTrail = part.charCodeAt(l++);
                    if (0xDC00 <= codeTrail && codeTrail < 0xE000)
                        writeTable[curAddr++] = 0x10000 + (code - 0xD800) * 0x400 + (codeTrail - 0xDC00);
                    else
                        throw new Error("Incorrect surrogate pair in "  + this.encodingName + " at chunk " + chunk[0]);
                }
                else if (0x0FF0 < code && code <= 0x0FFF) { // Character sequence (our own encoding used)
                    var len = 0xFFF - code + 2;
                    var seq = [];
                    for (var m = 0; m < len; m++)
                        seq.push(part.charCodeAt(l++)); // Simple variation: don't support surrogates or subsequences in seq.

                    writeTable[curAddr++] = SEQ_START - this.decodeTableSeq.length;
                    this.decodeTableSeq.push(seq);
                }
                else
                    writeTable[curAddr++] = code; // Basic char
            }
        }
        else if (typeof part === "number") { // Integer, meaning increasing sequence starting with prev character.
            var charCode = writeTable[curAddr - 1] + 1;
            for (var l = 0; l < part; l++)
                writeTable[curAddr++] = charCode++;
        }
        else
            throw new Error("Incorrect type '" + typeof part + "' given in "  + this.encodingName + " at chunk " + chunk[0]);
    }
    if (curAddr > 0xFF)
        throw new Error("Incorrect chunk in "  + this.encodingName + " at addr " + chunk[0] + ": too long" + curAddr);
}

// Encoder helpers

// Returns the 256-entry encode bucket that contains code point `uCode`, creating it if needed.
DBCSCodec.prototype._getEncodeBucket = function(uCode) {
    var high = uCode >> 8; // This could be > 0xFF because of astral characters.
    if (this.encodeTable[high] === undefined)
        this.encodeTable[high] = UNASSIGNED_NODE.slice(0); // Create bucket on demand.
    return this.encodeTable[high];
}

// Registers `dbcsCode` as the encoding of code point `uCode`.
// An existing plain mapping is never overwritten (the first one wins).
DBCSCodec.prototype._setEncodeChar = function(uCode, dbcsCode) {
    var bucket = this._getEncodeBucket(uCode);
    var low = uCode & 0xFF;
    if (bucket[low] <= SEQ_START)
        this.encodeTableSeq[SEQ_START-bucket[low]][DEF_CHAR] = dbcsCode; // There's already a sequence, set a single-char subsequence of it.
    else if (bucket[low] == UNASSIGNED)
        bucket[low] = dbcsCode;
}

// Registers `dbcsCode` as the encoding of the code point sequence `seq` (array of integers).
DBCSCodec.prototype._setEncodeSequence = function(seq, dbcsCode) {

    // Get the root of character tree according to first character of the sequence.
    var uCode = seq[0];
    var bucket = this._getEncodeBucket(uCode);
    var low = uCode & 0xFF;

    var node;
    if (bucket[low] <= SEQ_START) {
        // There's already a sequence with - use it.
        node = this.encodeTableSeq[SEQ_START-bucket[low]];
    }
    else {
        // There was no sequence object - allocate a new one.
        node = {};
        if (bucket[low] !== UNASSIGNED) node[DEF_CHAR] = bucket[low]; // If a char was set before - make it a single-char subsequence.
        bucket[low] = SEQ_START - this.encodeTableSeq.length;
        this.encodeTableSeq.push(node);
    }

    // Traverse the character tree, allocating new nodes as needed.
    // Each level is keyed by the corresponding character of the sequence.
    for (var j = 1; j < seq.length-1; j++) {
        uCode = seq[j];
        var oldVal = node[uCode];
        if (typeof oldVal === 'object')
            node = oldVal;
        else {
            node = node[uCode] = {};
            if (oldVal !== undefined)
                node[DEF_CHAR] = oldVal; // A shorter sequence ended here - keep it as this node's default.
        }
    }

    // Set the leaf to given dbcsCode.
    uCode = seq[seq.length-1];
    node[uCode] = dbcsCode;
}

// Walks the decode trie starting at node `nodeIdx` and fills the encode tables with the reverse mapping.
// `prefix` is the multibyte code of the path to this node, shifted left by one byte.
// Codes present in `skipEncodeChars` are not added (nor descended into).
DBCSCodec.prototype._fillEncodeTable = function(nodeIdx, prefix, skipEncodeChars) {
    var node = this.decodeTables[nodeIdx];
    for (var i = 0; i < 0x100; i++) {
        var uCode = node[i];
        var mbCode = prefix + i;
        if (skipEncodeChars[mbCode])
            continue;

        if (uCode >= 0)
            this._setEncodeChar(uCode, mbCode);
        else if (uCode <= NODE_START)
            // '>>> 0' keeps the prefix non-negative when the shift sets the sign bit.
            this._fillEncodeTable(NODE_START - uCode, (mbCode << 8) >>> 0, skipEncodeChars);
        else if (uCode <= SEQ_START)
            this._setEncodeSequence(this.decodeTableSeq[SEQ_START - uCode], mbCode);
    }
}



// == Encoder ==================================================================

// Stateful encoder: converts JS strings to multibyte buffers.
// State kept between write() calls: a pending lead surrogate and a partially matched character sequence.
function DBCSEncoder(options, codec) {
    // Encoder state
    this.leadSurrogate = -1;
    this.seqObj = undefined;

    // Static data
    this.encodeTable = codec.encodeTable;
    this.encodeTableSeq = codec.encodeTableSeq;
    this.defaultCharSingleByte = codec.defCharSB;
    this.gb18030 = codec.gb18030;
}

// Writes multibyte code `dbcsCode` big-endian into `buf` starting at `offset`, using 1 to 4 bytes
// depending on its magnitude. Returns the offset just past the written bytes.
function writeDbcsCode(buf, offset, dbcsCode) {
    if (dbcsCode < 0x100) {
        buf[offset++] = dbcsCode;
    }
    else if (dbcsCode < 0x10000) {
        buf[offset++] = dbcsCode >> 8;   // high byte
        buf[offset++] = dbcsCode & 0xFF; // low byte
    }
    else if (dbcsCode < 0x1000000) {
        buf[offset++] = dbcsCode >> 16;
        buf[offset++] = (dbcsCode >> 8) & 0xFF;
        buf[offset++] = dbcsCode & 0xFF;
    }
    else {
        // Unsigned shifts: the code may have its top bit set.
        buf[offset++] = dbcsCode >>> 24;
        buf[offset++] = (dbcsCode >>> 16) & 0xFF;
        buf[offset++] = (dbcsCode >>> 8) & 0xFF;
        buf[offset++] = dbcsCode & 0xFF;
    }
    return offset;
}

// Encodes `str` and returns a Buffer with the resulting bytes.
// Characters without a mapping are replaced by the default char. An unfinished surrogate pair or
// character sequence at the end of `str` is kept in the encoder state for the next write()/end().
DBCSEncoder.prototype.write = function(str) {
    // Every UTF-16 unit of `str` produces at most one code of up to 4 bytes. The 8 spare bytes are for
    // state left over from the previous call (default char for a lone lead surrogate and/or the code of
    // a pending sequence), which is written in addition to the characters of `str`.
    // Writes past the end of a Buffer are silently ignored, so an undersized buffer would lose data.
    var newBuf = Buffer.alloc(str.length * 4 + 8),
        leadSurrogate = this.leadSurrogate,
        seqObj = this.seqObj, nextChar = -1,
        i = 0, j = 0,
        uCode;

    while (true) {
        // 0. Get next character.
        if (nextChar === -1) {
            if (i == str.length) break;
            uCode = str.charCodeAt(i++);
        }
        else {
            uCode = nextChar;
            nextChar = -1;
        }

        // 1. Handle surrogates.
        if (0xD800 <= uCode && uCode < 0xE000) { // Char is one of surrogates.
            if (uCode < 0xDC00) { // We've got lead surrogate.
                if (leadSurrogate === -1) {
                    leadSurrogate = uCode;
                    continue;
                } else {
                    leadSurrogate = uCode;
                    // Double lead surrogate found.
                    uCode = UNASSIGNED;
                }
            } else { // We've got trail surrogate.
                if (leadSurrogate !== -1) {
                    uCode = 0x10000 + (leadSurrogate - 0xD800) * 0x400 + (uCode - 0xDC00);
                    leadSurrogate = -1;
                } else {
                    // Incomplete surrogate pair - only trail surrogate found.
                    uCode = UNASSIGNED;
                }

            }
        }
        else if (leadSurrogate !== -1) {
            // Incomplete surrogate pair - only lead surrogate found.
            nextChar = uCode; uCode = UNASSIGNED; // Write an error, then current char.
            leadSurrogate = -1;
        }

        // 2. Convert uCode character.
        var dbcsCode = UNASSIGNED;
        if (seqObj !== undefined && uCode != UNASSIGNED) { // We are in the middle of the sequence
            var resCode = seqObj[uCode];
            if (typeof resCode === 'object') { // Sequence continues.
                seqObj = resCode;
                continue;

            } else if (typeof resCode == 'number') { // Sequence finished. Write it.
                dbcsCode = resCode;

            } else if (resCode == undefined) { // Current character is not part of the sequence.

                // Try default character for this sequence
                resCode = seqObj[DEF_CHAR];
                if (resCode !== undefined) {
                    dbcsCode = resCode; // Found. Write it.
                    nextChar = uCode; // Current character will be written too in the next iteration.

                } else {
                    // TODO: What if we have no default? (resCode == undefined)
                    // Then, we should write first char of the sequence as-is and try the rest recursively.
                    // Didn't do it for now because no encoding has this situation yet.
                    // Currently, just skip the sequence and write current char.
                }
            }
            seqObj = undefined;
        }
        else if (uCode >= 0) {  // Regular character
            var subtable = this.encodeTable[uCode >> 8];
            if (subtable !== undefined)
                dbcsCode = subtable[uCode & 0xFF];

            if (dbcsCode <= SEQ_START) { // Sequence start
                seqObj = this.encodeTableSeq[SEQ_START-dbcsCode];
                continue;
            }

            if (dbcsCode == UNASSIGNED && this.gb18030) {
                // Use GB18030 algorithm to find character(s) to write.
                var idx = findIdx(this.gb18030.uChars, uCode);
                if (idx != -1) {
                    // Linear offset within the range, then split into the four bytes of the sequence.
                    var rest = this.gb18030.gbChars[idx] + (uCode - this.gb18030.uChars[idx]);
                    newBuf[j++] = 0x81 + Math.floor(rest / 12600); rest = rest % 12600;
                    newBuf[j++] = 0x30 + Math.floor(rest / 1260); rest = rest % 1260;
                    newBuf[j++] = 0x81 + Math.floor(rest / 10); rest = rest % 10;
                    newBuf[j++] = 0x30 + rest;
                    continue;
                }
            }
        }

        // 3. Write dbcsCode character.
        if (dbcsCode === UNASSIGNED)
            dbcsCode = this.defaultCharSingleByte;

        j = writeDbcsCode(newBuf, j, dbcsCode);
    }

    this.seqObj = seqObj;
    this.leadSurrogate = leadSurrogate;
    return newBuf.slice(0, j);
}

// Flushes the encoder state. Returns undefined when there is nothing pending, otherwise a Buffer with
// the code of the pending sequence prefix (if it has one) and/or the default char for a lone lead surrogate.
DBCSEncoder.prototype.end = function() {
    if (this.leadSurrogate === -1 && this.seqObj === undefined)
        return; // All clean. Most often case.

    var newBuf = Buffer.alloc(10), j = 0;

    if (this.seqObj) { // We're in the sequence.
        var dbcsCode = this.seqObj[DEF_CHAR];
        if (dbcsCode !== undefined) { // Write beginning of the sequence.
            j = writeDbcsCode(newBuf, j, dbcsCode);
        } else {
            // See todo in write(): a sequence prefix without its own code is dropped.
        }
        this.seqObj = undefined;
    }

    if (this.leadSurrogate !== -1) {
        // Incomplete surrogate pair - only lead surrogate found.
        j = writeDbcsCode(newBuf, j, this.defaultCharSingleByte);
        this.leadSurrogate = -1;
    }

    return newBuf.slice(0, j);
}

// Export for testing
DBCSEncoder.prototype.findIdx = findIdx;


// == Decoder ==================================================================

// Stateful decoder: converts multibyte buffers to JS strings.
// State kept between write() calls: the current trie node and the bytes of the unfinished sequence.
function DBCSDecoder(options, codec) {
    // Decoder state
    this.nodeIdx = 0;
    this.prevBuf = Buffer.alloc(0);

    // Static data
    this.decodeTables = codec.decodeTables;
    this.decodeTableSeq = codec.decodeTableSeq;
    this.defaultCharUnicode = codec.defaultCharUnicode;
    this.gb18030 = codec.gb18030;
}

// Decodes `buf` (continuing any sequence left unfinished by the previous call) and returns the string.
// Invalid sequences produce the default char for their first byte; the remaining bytes are parsed again.
// Throws Error if a decode table contains a value of unknown kind.
DBCSDecoder.prototype.write = function(buf) {
    var prevBufOffset = this.prevBuf.length,
        // Bytes carried over from the previous call can be re-parsed (when the sequence they started
        // turns out to be invalid) and then produce output too, so they count towards the output size.
        newBuf = Buffer.alloc((buf.length + prevBufOffset) * 2),
        nodeIdx = this.nodeIdx,
        prevBuf = this.prevBuf,
        seqStart = -prevBufOffset, // idx of the start of current parsed sequence.
        uCode;

    if (prevBufOffset > 0) // Make prev buf overlap a little to make it easier to slice later.
        prevBuf = Buffer.concat([prevBuf, buf.slice(0, 10)]);

    for (var i = 0, j = 0; i < buf.length; i++) {
        // Negative `i` addresses the bytes carried over from the previous call.
        var curByte = (i >= 0) ? buf[i] : prevBuf[i + prevBufOffset];

        // Lookup in current trie node.
        uCode = this.decodeTables[nodeIdx][curByte];

        if (uCode >= 0) {
            // Normal character, just use it.
        }
        else if (uCode === UNASSIGNED) { // Unknown char.
            // TODO: Callback with seq.
            i = seqStart; // Try to parse again, after skipping first byte of the sequence ('i' will be incremented by 'for' cycle).
            uCode = this.defaultCharUnicode.charCodeAt(0);
        }
        else if (uCode === GB18030_CODE) {
            var curSeq = (seqStart >= 0) ? buf.slice(seqStart, i+1) : prevBuf.slice(seqStart + prevBufOffset, i+1 + prevBufOffset);
            var ptr = (curSeq[0]-0x81)*12600 + (curSeq[1]-0x30)*1260 + (curSeq[2]-0x81)*10 + (curSeq[3]-0x30);
            var idx = findIdx(this.gb18030.gbChars, ptr);
            uCode = this.gb18030.uChars[idx] + ptr - this.gb18030.gbChars[idx];
        }
        else if (uCode <= NODE_START) { // Go to next trie node.
            nodeIdx = NODE_START - uCode;
            continue;
        }
        else if (uCode <= SEQ_START) { // Output a sequence of chars.
            var seq = this.decodeTableSeq[SEQ_START - uCode];
            for (var k = 0; k < seq.length - 1; k++) {
                uCode = seq[k];
                newBuf[j++] = uCode & 0xFF;
                newBuf[j++] = uCode >> 8;
            }
            uCode = seq[seq.length-1];
        }
        else
            throw new Error("iconv-lite internal error: invalid decoding table value " + uCode + " at " + nodeIdx + "/" + curByte);

        // Write the character to buffer, handling higher planes using surrogate pair.
        if (uCode > 0xFFFF) {
            uCode -= 0x10000;
            var uCodeLead = 0xD800 + Math.floor(uCode / 0x400);
            newBuf[j++] = uCodeLead & 0xFF;
            newBuf[j++] = uCodeLead >> 8;

            uCode = 0xDC00 + uCode % 0x400;
        }
        newBuf[j++] = uCode & 0xFF;
        newBuf[j++] = uCode >> 8;

        // Reset trie node.
        nodeIdx = 0; seqStart = i+1;
    }

    this.nodeIdx = nodeIdx;
    // Keep the bytes of the unfinished sequence. They are copied rather than kept as a view of `buf`,
    // so the caller is free to reuse its buffer before the next write().
    this.prevBuf = (seqStart >= 0) ? Buffer.from(buf.slice(seqStart)) : prevBuf.slice(seqStart + prevBufOffset);
    return newBuf.slice(0, j).toString('ucs2');
}

// Flushes the decoder: every byte of an unfinished sequence that cannot be parsed on its own
// is replaced by the default char. Returns the resulting string ('' when nothing was pending).
DBCSDecoder.prototype.end = function() {
    var ret = '';

    // Try to parse all remaining chars.
    while (this.prevBuf.length > 0) {
        // Skip 1 character in the buffer.
        ret += this.defaultCharUnicode;
        var buf = this.prevBuf.slice(1);

        // Parse remaining as usual.
        this.prevBuf = Buffer.alloc(0);
        this.nodeIdx = 0;
        if (buf.length > 0)
            ret += this.write(buf);
    }

    this.nodeIdx = 0;
    return ret;
}

// Binary search for GB18030. Returns largest i such that table[i] <= val,
// or -1 when val is smaller than the first element. `table` must be sorted in ascending order.
function findIdx(table, val) {
    if (table[0] > val)
        return -1;

    var l = 0, r = table.length;
    while (l < r-1) { // always table[l] <= val < table[r]
        var mid = l + Math.floor((r-l+1)/2);
        if (table[mid] <= val)
            l = mid;
        else
            r = mid;
    }
    return l;
}