twitst4tz

twitter statistics web application
Log | Files | Refs | README | LICENSE

utf16.js (5011B)


      1 "use strict";
      2 var Buffer = require("safer-buffer").Buffer;
      3 
      4 // Note: UTF16-LE (or UCS2) codec is Node.js native. See encodings/internal.js
      5 
      6 // == UTF16-BE codec. ==========================================================
      7 
      8 exports.utf16be = Utf16BECodec;
      9 function Utf16BECodec() {
     10 }
     11 
     12 Utf16BECodec.prototype.encoder = Utf16BEEncoder;
     13 Utf16BECodec.prototype.decoder = Utf16BEDecoder;
     14 Utf16BECodec.prototype.bomAware = true;
     15 
     16 
     17 // -- Encoding
     18 
     19 function Utf16BEEncoder() {
     20 }
     21 
     22 Utf16BEEncoder.prototype.write = function(str) {
     23     var buf = Buffer.from(str, 'ucs2');
     24     for (var i = 0; i < buf.length; i += 2) {
     25         var tmp = buf[i]; buf[i] = buf[i+1]; buf[i+1] = tmp;
     26     }
     27     return buf;
     28 }
     29 
     30 Utf16BEEncoder.prototype.end = function() {
     31 }
     32 
     33 
     34 // -- Decoding
     35 
     36 function Utf16BEDecoder() {
     37     this.overflowByte = -1;
     38 }
     39 
     40 Utf16BEDecoder.prototype.write = function(buf) {
     41     if (buf.length == 0)
     42         return '';
     43 
     44     var buf2 = Buffer.alloc(buf.length + 1),
     45         i = 0, j = 0;
     46 
     47     if (this.overflowByte !== -1) {
     48         buf2[0] = buf[0];
     49         buf2[1] = this.overflowByte;
     50         i = 1; j = 2;
     51     }
     52 
     53     for (; i < buf.length-1; i += 2, j+= 2) {
     54         buf2[j] = buf[i+1];
     55         buf2[j+1] = buf[i];
     56     }
     57 
     58     this.overflowByte = (i == buf.length-1) ? buf[buf.length-1] : -1;
     59 
     60     return buf2.slice(0, j).toString('ucs2');
     61 }
     62 
     63 Utf16BEDecoder.prototype.end = function() {
     64 }
     65 
     66 
     67 // == UTF-16 codec =============================================================
// Decoder chooses automatically between UTF-16LE and UTF-16BE using the BOM and, when no BOM is present, an ASCII-based heuristic.
     69 // Defaults to UTF-16LE, as it's prevalent and default in Node.
     70 // http://en.wikipedia.org/wiki/UTF-16 and http://encoding.spec.whatwg.org/#utf-16le
     71 // Decoder default can be changed: iconv.decode(buf, 'utf16', {defaultEncoding: 'utf-16be'});
     72 
     73 // Encoder uses UTF-16LE and prepends BOM (which can be overridden with addBOM: false).
     74 
     75 exports.utf16 = Utf16Codec;
     76 function Utf16Codec(codecOptions, iconv) {
     77     this.iconv = iconv;
     78 }
     79 
     80 Utf16Codec.prototype.encoder = Utf16Encoder;
     81 Utf16Codec.prototype.decoder = Utf16Decoder;
     82 
     83 
     84 // -- Encoding (pass-through)
     85 
     86 function Utf16Encoder(options, codec) {
     87     options = options || {};
     88     if (options.addBOM === undefined)
     89         options.addBOM = true;
     90     this.encoder = codec.iconv.getEncoder('utf-16le', options);
     91 }
     92 
     93 Utf16Encoder.prototype.write = function(str) {
     94     return this.encoder.write(str);
     95 }
     96 
     97 Utf16Encoder.prototype.end = function() {
     98     return this.encoder.end();
     99 }
    100 
    101 
    102 // -- Decoding
    103 
    104 function Utf16Decoder(options, codec) {
    105     this.decoder = null;
    106     this.initialBytes = [];
    107     this.initialBytesLen = 0;
    108 
    109     this.options = options || {};
    110     this.iconv = codec.iconv;
    111 }
    112 
    113 Utf16Decoder.prototype.write = function(buf) {
    114     if (!this.decoder) {
    115         // Codec is not chosen yet. Accumulate initial bytes.
    116         this.initialBytes.push(buf);
    117         this.initialBytesLen += buf.length;
    118         
    119         if (this.initialBytesLen < 16) // We need more bytes to use space heuristic (see below)
    120             return '';
    121 
    122         // We have enough bytes -> detect endianness.
    123         var buf = Buffer.concat(this.initialBytes),
    124             encoding = detectEncoding(buf, this.options.defaultEncoding);
    125         this.decoder = this.iconv.getDecoder(encoding, this.options);
    126         this.initialBytes.length = this.initialBytesLen = 0;
    127     }
    128 
    129     return this.decoder.write(buf);
    130 }
    131 
    132 Utf16Decoder.prototype.end = function() {
    133     if (!this.decoder) {
    134         var buf = Buffer.concat(this.initialBytes),
    135             encoding = detectEncoding(buf, this.options.defaultEncoding);
    136         this.decoder = this.iconv.getDecoder(encoding, this.options);
    137 
    138         var res = this.decoder.write(buf),
    139             trail = this.decoder.end();
    140 
    141         return trail ? (res + trail) : res;
    142     }
    143     return this.decoder.end();
    144 }
    145 
    146 function detectEncoding(buf, defaultEncoding) {
    147     var enc = defaultEncoding || 'utf-16le';
    148 
    149     if (buf.length >= 2) {
    150         // Check BOM.
    151         if (buf[0] == 0xFE && buf[1] == 0xFF) // UTF-16BE BOM
    152             enc = 'utf-16be';
    153         else if (buf[0] == 0xFF && buf[1] == 0xFE) // UTF-16LE BOM
    154             enc = 'utf-16le';
    155         else {
    156             // No BOM found. Try to deduce encoding from initial content.
    157             // Most of the time, the content has ASCII chars (U+00**), but the opposite (U+**00) is uncommon.
    158             // So, we count ASCII as if it was LE or BE, and decide from that.
    159             var asciiCharsLE = 0, asciiCharsBE = 0, // Counts of chars in both positions
    160                 _len = Math.min(buf.length - (buf.length % 2), 64); // Len is always even.
    161 
    162             for (var i = 0; i < _len; i += 2) {
    163                 if (buf[i] === 0 && buf[i+1] !== 0) asciiCharsBE++;
    164                 if (buf[i] !== 0 && buf[i+1] === 0) asciiCharsLE++;
    165             }
    166 
    167             if (asciiCharsBE > asciiCharsLE)
    168                 enc = 'utf-16be';
    169             else if (asciiCharsBE < asciiCharsLE)
    170                 enc = 'utf-16le';
    171         }
    172     }
    173 
    174     return enc;
    175 }
    176 
    177