dbcs-data.js (8291B)
1 "use strict"; 2 3 // Description of supported double byte encodings and aliases. 4 // Tables are not require()-d until they are needed to speed up library load. 5 // require()-s are direct to support Browserify. 6 7 module.exports = { 8 9 // == Japanese/ShiftJIS ==================================================== 10 // All japanese encodings are based on JIS X set of standards: 11 // JIS X 0201 - Single-byte encoding of ASCII + ¥ + Kana chars at 0xA1-0xDF. 12 // JIS X 0208 - Main set of 6879 characters, placed in 94x94 plane, to be encoded by 2 bytes. 13 // Has several variations in 1978, 1983, 1990 and 1997. 14 // JIS X 0212 - Supplementary plane of 6067 chars in 94x94 plane. 1990. Effectively dead. 15 // JIS X 0213 - Extension and modern replacement of 0208 and 0212. Total chars: 11233. 16 // 2 planes, first is superset of 0208, second - revised 0212. 17 // Introduced in 2000, revised 2004. Some characters are in Unicode Plane 2 (0x2xxxx) 18 19 // Byte encodings are: 20 // * Shift_JIS: Compatible with 0201, uses not defined chars in top half as lead bytes for double-byte 21 // encoding of 0208. Lead byte ranges: 0x81-0x9F, 0xE0-0xEF; Trail byte ranges: 0x40-0x7E, 0x80-0x9E, 0x9F-0xFC. 22 // Windows CP932 is a superset of Shift_JIS. Some companies added more chars, notably KDDI. 23 // * EUC-JP: Up to 3 bytes per character. Used mostly on *nixes. 24 // 0x00-0x7F - lower part of 0201 25 // 0x8E, 0xA1-0xDF - upper part of 0201 26 // (0xA1-0xFE)x2 - 0208 plane (94x94). 27 // 0x8F, (0xA1-0xFE)x2 - 0212 plane (94x94). 28 // * JIS X 208: 7-bit, direct encoding of 0208. Byte ranges: 0x21-0x7E (94 values). Uncommon. 29 // Used as-is in ISO2022 family. 30 // * ISO2022-JP: Stateful encoding, with escape sequences to switch between ASCII, 31 // 0201-1976 Roman, 0208-1978, 0208-1983. 32 // * ISO2022-JP-1: Adds esc seq for 0212-1990. 33 // * ISO2022-JP-2: Adds esc seq for GB2313-1980, KSX1001-1992, ISO8859-1, ISO8859-7. 34 // * ISO2022-JP-3: Adds esc seq for 0201-1976 Kana set, 0213-2000 Planes 1, 2. 35 // * ISO2022-JP-2004: Adds 0213-2004 Plane 1. 36 // 37 // After JIS X 0213 appeared, Shift_JIS-2004, EUC-JISX0213 and ISO2022-JP-2004 followed, with just changing the planes. 38 // 39 // Overall, it seems that it's a mess :( http://www8.plala.or.jp/tkubota1/unicode-symbols-map2.html 40 41 'shiftjis': { 42 type: '_dbcs', 43 table: function() { return require('./tables/shiftjis.json') }, 44 encodeAdd: {'\u00a5': 0x5C, '\u203E': 0x7E}, 45 encodeSkipVals: [{from: 0xED40, to: 0xF940}], 46 }, 47 'csshiftjis': 'shiftjis', 48 'mskanji': 'shiftjis', 49 'sjis': 'shiftjis', 50 'windows31j': 'shiftjis', 51 'ms31j': 'shiftjis', 52 'xsjis': 'shiftjis', 53 'windows932': 'shiftjis', 54 'ms932': 'shiftjis', 55 '932': 'shiftjis', 56 'cp932': 'shiftjis', 57 58 'eucjp': { 59 type: '_dbcs', 60 table: function() { return require('./tables/eucjp.json') }, 61 encodeAdd: {'\u00a5': 0x5C, '\u203E': 0x7E}, 62 }, 63 64 // TODO: KDDI extension to Shift_JIS 65 // TODO: IBM CCSID 942 = CP932, but F0-F9 custom chars and other char changes. 66 // TODO: IBM CCSID 943 = Shift_JIS = CP932 with original Shift_JIS lower 128 chars. 67 68 69 // == Chinese/GBK ========================================================== 70 // http://en.wikipedia.org/wiki/GBK 71 // We mostly implement W3C recommendation: https://www.w3.org/TR/encoding/#gbk-encoder 72 73 // Oldest GB2312 (1981, ~7600 chars) is a subset of CP936 74 'gb2312': 'cp936', 75 'gb231280': 'cp936', 76 'gb23121980': 'cp936', 77 'csgb2312': 'cp936', 78 'csiso58gb231280': 'cp936', 79 'euccn': 'cp936', 80 81 // Microsoft's CP936 is a subset and approximation of GBK. 82 'windows936': 'cp936', 83 'ms936': 'cp936', 84 '936': 'cp936', 85 'cp936': { 86 type: '_dbcs', 87 table: function() { return require('./tables/cp936.json') }, 88 }, 89 90 // GBK (~22000 chars) is an extension of CP936 that added user-mapped chars and some other. 91 'gbk': { 92 type: '_dbcs', 93 table: function() { return require('./tables/cp936.json').concat(require('./tables/gbk-added.json')) }, 94 }, 95 'xgbk': 'gbk', 96 'isoir58': 'gbk', 97 98 // GB18030 is an algorithmic extension of GBK. 99 // Main source: https://www.w3.org/TR/encoding/#gbk-encoder 100 // http://icu-project.org/docs/papers/gb18030.html 101 // http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/gb-18030-2000.xml 102 // http://www.khngai.com/chinese/charmap/tblgbk.php?page=0 103 'gb18030': { 104 type: '_dbcs', 105 table: function() { return require('./tables/cp936.json').concat(require('./tables/gbk-added.json')) }, 106 gb18030: function() { return require('./tables/gb18030-ranges.json') }, 107 encodeSkipVals: [0x80], 108 encodeAdd: {'€': 0xA2E3}, 109 }, 110 111 'chinese': 'gb18030', 112 113 114 // == Korean =============================================================== 115 // EUC-KR, KS_C_5601 and KS X 1001 are exactly the same. 116 'windows949': 'cp949', 117 'ms949': 'cp949', 118 '949': 'cp949', 119 'cp949': { 120 type: '_dbcs', 121 table: function() { return require('./tables/cp949.json') }, 122 }, 123 124 'cseuckr': 'cp949', 125 'csksc56011987': 'cp949', 126 'euckr': 'cp949', 127 'isoir149': 'cp949', 128 'korean': 'cp949', 129 'ksc56011987': 'cp949', 130 'ksc56011989': 'cp949', 131 'ksc5601': 'cp949', 132 133 134 // == Big5/Taiwan/Hong Kong ================================================ 135 // There are lots of tables for Big5 and cp950. Please see the following links for history: 136 // http://moztw.org/docs/big5/ http://www.haible.de/bruno/charsets/conversion-tables/Big5.html 137 // Variations, in roughly number of defined chars: 138 // * Windows CP 950: Microsoft variant of Big5. Canonical: http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP950.TXT 139 // * Windows CP 951: Microsoft variant of Big5-HKSCS-2001. Seems to be never public. http://me.abelcheung.org/articles/research/what-is-cp951/ 140 // * Big5-2003 (Taiwan standard) almost superset of cp950. 141 // * Unicode-at-on (UAO) / Mozilla 1.8. Falling out of use on the Web. Not supported by other browsers. 142 // * Big5-HKSCS (-2001, -2004, -2008). Hong Kong standard. 143 // many unicode code points moved from PUA to Supplementary plane (U+2XXXX) over the years. 144 // Plus, it has 4 combining sequences. 145 // Seems that Mozilla refused to support it for 10 yrs. https://bugzilla.mozilla.org/show_bug.cgi?id=162431 https://bugzilla.mozilla.org/show_bug.cgi?id=310299 146 // because big5-hkscs is the only encoding to include astral characters in non-algorithmic way. 147 // Implementations are not consistent within browsers; sometimes labeled as just big5. 148 // MS Internet Explorer switches from big5 to big5-hkscs when a patch applied. 149 // Great discussion & recap of what's going on https://bugzilla.mozilla.org/show_bug.cgi?id=912470#c31 150 // In the encoder, it might make sense to support encoding old PUA mappings to Big5 bytes seq-s. 151 // Official spec: http://www.ogcio.gov.hk/en/business/tech_promotion/ccli/terms/doc/2003cmp_2008.txt 152 // http://www.ogcio.gov.hk/tc/business/tech_promotion/ccli/terms/doc/hkscs-2008-big5-iso.txt 153 // 154 // Current understanding of how to deal with Big5(-HKSCS) is in the Encoding Standard, http://encoding.spec.whatwg.org/#big5-encoder 155 // Unicode mapping (http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/OTHER/BIG5.TXT) is said to be wrong. 156 157 'windows950': 'cp950', 158 'ms950': 'cp950', 159 '950': 'cp950', 160 'cp950': { 161 type: '_dbcs', 162 table: function() { return require('./tables/cp950.json') }, 163 }, 164 165 // Big5 has many variations and is an extension of cp950. We use Encoding Standard's as a consensus. 166 'big5': 'big5hkscs', 167 'big5hkscs': { 168 type: '_dbcs', 169 table: function() { return require('./tables/cp950.json').concat(require('./tables/big5-added.json')) }, 170 encodeSkipVals: [0xa2cc], 171 }, 172 173 'cnbig5': 'big5hkscs', 174 'csbig5': 'big5hkscs', 175 'xxbig5': 'big5hkscs', 176 };