Tokenizer.js (16182B)
1 const { defaults } = require('./defaults.js'); 2 const { 3 rtrim, 4 splitCells, 5 escape, 6 findClosingBracket 7 } = require('./helpers.js'); 8 9 function outputLink(cap, link, raw) { 10 const href = link.href; 11 const title = link.title ? escape(link.title) : null; 12 const text = cap[1].replace(/\\([\[\]])/g, '$1'); 13 14 if (cap[0].charAt(0) !== '!') { 15 return { 16 type: 'link', 17 raw, 18 href, 19 title, 20 text 21 }; 22 } else { 23 return { 24 type: 'image', 25 raw, 26 href, 27 title, 28 text: escape(text) 29 }; 30 } 31 } 32 33 function indentCodeCompensation(raw, text) { 34 const matchIndentToCode = raw.match(/^(\s+)(?:```)/); 35 36 if (matchIndentToCode === null) { 37 return text; 38 } 39 40 const indentToCode = matchIndentToCode[1]; 41 42 return text 43 .split('\n') 44 .map(node => { 45 const matchIndentInNode = node.match(/^\s+/); 46 if (matchIndentInNode === null) { 47 return node; 48 } 49 50 const [indentInNode] = matchIndentInNode; 51 52 if (indentInNode.length >= indentToCode.length) { 53 return node.slice(indentToCode.length); 54 } 55 56 return node; 57 }) 58 .join('\n'); 59 } 60 61 /** 62 * Tokenizer 63 */ 64 module.exports = class Tokenizer { 65 constructor(options) { 66 this.options = options || defaults; 67 } 68 69 space(src) { 70 const cap = this.rules.block.newline.exec(src); 71 if (cap) { 72 if (cap[0].length > 1) { 73 return { 74 type: 'space', 75 raw: cap[0] 76 }; 77 } 78 return { raw: '\n' }; 79 } 80 } 81 82 code(src, tokens) { 83 const cap = this.rules.block.code.exec(src); 84 if (cap) { 85 const lastToken = tokens[tokens.length - 1]; 86 // An indented code block cannot interrupt a paragraph. 87 if (lastToken && lastToken.type === 'paragraph') { 88 return { 89 raw: cap[0], 90 text: cap[0].trimRight() 91 }; 92 } 93 94 const text = cap[0].replace(/^ {4}/gm, ''); 95 return { 96 type: 'code', 97 raw: cap[0], 98 codeBlockStyle: 'indented', 99 text: !this.options.pedantic 100 ? rtrim(text, '\n') 101 : text 102 }; 103 } 104 } 105 106 fences(src) { 107 const cap = this.rules.block.fences.exec(src); 108 if (cap) { 109 const raw = cap[0]; 110 const text = indentCodeCompensation(raw, cap[3] || ''); 111 112 return { 113 type: 'code', 114 raw, 115 lang: cap[2] ? cap[2].trim() : cap[2], 116 text 117 }; 118 } 119 } 120 121 heading(src) { 122 const cap = this.rules.block.heading.exec(src); 123 if (cap) { 124 return { 125 type: 'heading', 126 raw: cap[0], 127 depth: cap[1].length, 128 text: cap[2] 129 }; 130 } 131 } 132 133 nptable(src) { 134 const cap = this.rules.block.nptable.exec(src); 135 if (cap) { 136 const item = { 137 type: 'table', 138 header: splitCells(cap[1].replace(/^ *| *\| *$/g, '')), 139 align: cap[2].replace(/^ *|\| *$/g, '').split(/ *\| */), 140 cells: cap[3] ? cap[3].replace(/\n$/, '').split('\n') : [], 141 raw: cap[0] 142 }; 143 144 if (item.header.length === item.align.length) { 145 let l = item.align.length; 146 let i; 147 for (i = 0; i < l; i++) { 148 if (/^ *-+: *$/.test(item.align[i])) { 149 item.align[i] = 'right'; 150 } else if (/^ *:-+: *$/.test(item.align[i])) { 151 item.align[i] = 'center'; 152 } else if (/^ *:-+ *$/.test(item.align[i])) { 153 item.align[i] = 'left'; 154 } else { 155 item.align[i] = null; 156 } 157 } 158 159 l = item.cells.length; 160 for (i = 0; i < l; i++) { 161 item.cells[i] = splitCells(item.cells[i], item.header.length); 162 } 163 164 return item; 165 } 166 } 167 } 168 169 hr(src) { 170 const cap = this.rules.block.hr.exec(src); 171 if (cap) { 172 return { 173 type: 'hr', 174 raw: cap[0] 175 }; 176 } 177 } 178 179 blockquote(src) { 180 const cap = this.rules.block.blockquote.exec(src); 181 if (cap) { 182 const text = cap[0].replace(/^ *> ?/gm, ''); 183 184 return { 185 type: 'blockquote', 186 raw: cap[0], 187 text 188 }; 189 } 190 } 191 192 list(src) { 193 const cap = this.rules.block.list.exec(src); 194 if (cap) { 195 let raw = cap[0]; 196 const bull = cap[2]; 197 const isordered = bull.length > 1; 198 const isparen = bull[bull.length - 1] === ')'; 199 200 const list = { 201 type: 'list', 202 raw, 203 ordered: isordered, 204 start: isordered ? +bull.slice(0, -1) : '', 205 loose: false, 206 items: [] 207 }; 208 209 // Get each top-level item. 210 const itemMatch = cap[0].match(this.rules.block.item); 211 212 let next = false, 213 item, 214 space, 215 b, 216 addBack, 217 loose, 218 istask, 219 ischecked; 220 221 const l = itemMatch.length; 222 for (let i = 0; i < l; i++) { 223 item = itemMatch[i]; 224 raw = item; 225 226 // Remove the list item's bullet 227 // so it is seen as the next token. 228 space = item.length; 229 item = item.replace(/^ *([*+-]|\d+[.)]) */, ''); 230 231 // Outdent whatever the 232 // list item contains. Hacky. 233 if (~item.indexOf('\n ')) { 234 space -= item.length; 235 item = !this.options.pedantic 236 ? item.replace(new RegExp('^ {1,' + space + '}', 'gm'), '') 237 : item.replace(/^ {1,4}/gm, ''); 238 } 239 240 // Determine whether the next list item belongs here. 241 // Backpedal if it does not belong in this list. 242 if (i !== l - 1) { 243 b = this.rules.block.bullet.exec(itemMatch[i + 1])[0]; 244 if (isordered ? b.length === 1 || (!isparen && b[b.length - 1] === ')') 245 : (b.length > 1 || (this.options.smartLists && b !== bull))) { 246 addBack = itemMatch.slice(i + 1).join('\n'); 247 list.raw = list.raw.substring(0, list.raw.length - addBack.length); 248 i = l - 1; 249 } 250 } 251 252 // Determine whether item is loose or not. 253 // Use: /(^|\n)(?! )[^\n]+\n\n(?!\s*$)/ 254 // for discount behavior. 255 loose = next || /\n\n(?!\s*$)/.test(item); 256 if (i !== l - 1) { 257 next = item.charAt(item.length - 1) === '\n'; 258 if (!loose) loose = next; 259 } 260 261 if (loose) { 262 list.loose = true; 263 } 264 265 // Check for task list items 266 istask = /^\[[ xX]\] /.test(item); 267 ischecked = undefined; 268 if (istask) { 269 ischecked = item[1] !== ' '; 270 item = item.replace(/^\[[ xX]\] +/, ''); 271 } 272 273 list.items.push({ 274 type: 'list_item', 275 raw, 276 task: istask, 277 checked: ischecked, 278 loose: loose, 279 text: item 280 }); 281 } 282 283 return list; 284 } 285 } 286 287 html(src) { 288 const cap = this.rules.block.html.exec(src); 289 if (cap) { 290 return { 291 type: this.options.sanitize 292 ? 'paragraph' 293 : 'html', 294 raw: cap[0], 295 pre: !this.options.sanitizer 296 && (cap[1] === 'pre' || cap[1] === 'script' || cap[1] === 'style'), 297 text: this.options.sanitize ? (this.options.sanitizer ? this.options.sanitizer(cap[0]) : escape(cap[0])) : cap[0] 298 }; 299 } 300 } 301 302 def(src) { 303 const cap = this.rules.block.def.exec(src); 304 if (cap) { 305 if (cap[3]) cap[3] = cap[3].substring(1, cap[3].length - 1); 306 const tag = cap[1].toLowerCase().replace(/\s+/g, ' '); 307 return { 308 tag, 309 raw: cap[0], 310 href: cap[2], 311 title: cap[3] 312 }; 313 } 314 } 315 316 table(src) { 317 const cap = this.rules.block.table.exec(src); 318 if (cap) { 319 const item = { 320 type: 'table', 321 header: splitCells(cap[1].replace(/^ *| *\| *$/g, '')), 322 align: cap[2].replace(/^ *|\| *$/g, '').split(/ *\| */), 323 cells: cap[3] ? cap[3].replace(/\n$/, '').split('\n') : [] 324 }; 325 326 if (item.header.length === item.align.length) { 327 item.raw = cap[0]; 328 329 let l = item.align.length; 330 let i; 331 for (i = 0; i < l; i++) { 332 if (/^ *-+: *$/.test(item.align[i])) { 333 item.align[i] = 'right'; 334 } else if (/^ *:-+: *$/.test(item.align[i])) { 335 item.align[i] = 'center'; 336 } else if (/^ *:-+ *$/.test(item.align[i])) { 337 item.align[i] = 'left'; 338 } else { 339 item.align[i] = null; 340 } 341 } 342 343 l = item.cells.length; 344 for (i = 0; i < l; i++) { 345 item.cells[i] = splitCells( 346 item.cells[i].replace(/^ *\| *| *\| *$/g, ''), 347 item.header.length); 348 } 349 350 return item; 351 } 352 } 353 } 354 355 lheading(src) { 356 const cap = this.rules.block.lheading.exec(src); 357 if (cap) { 358 return { 359 type: 'heading', 360 raw: cap[0], 361 depth: cap[2].charAt(0) === '=' ? 1 : 2, 362 text: cap[1] 363 }; 364 } 365 } 366 367 paragraph(src) { 368 const cap = this.rules.block.paragraph.exec(src); 369 if (cap) { 370 return { 371 type: 'paragraph', 372 raw: cap[0], 373 text: cap[1].charAt(cap[1].length - 1) === '\n' 374 ? cap[1].slice(0, -1) 375 : cap[1] 376 }; 377 } 378 } 379 380 text(src, tokens) { 381 const cap = this.rules.block.text.exec(src); 382 if (cap) { 383 const lastToken = tokens[tokens.length - 1]; 384 if (lastToken && lastToken.type === 'text') { 385 return { 386 raw: cap[0], 387 text: cap[0] 388 }; 389 } 390 391 return { 392 type: 'text', 393 raw: cap[0], 394 text: cap[0] 395 }; 396 } 397 } 398 399 escape(src) { 400 const cap = this.rules.inline.escape.exec(src); 401 if (cap) { 402 return { 403 type: 'escape', 404 raw: cap[0], 405 text: escape(cap[1]) 406 }; 407 } 408 } 409 410 tag(src, inLink, inRawBlock) { 411 const cap = this.rules.inline.tag.exec(src); 412 if (cap) { 413 if (!inLink && /^<a /i.test(cap[0])) { 414 inLink = true; 415 } else if (inLink && /^<\/a>/i.test(cap[0])) { 416 inLink = false; 417 } 418 if (!inRawBlock && /^<(pre|code|kbd|script)(\s|>)/i.test(cap[0])) { 419 inRawBlock = true; 420 } else if (inRawBlock && /^<\/(pre|code|kbd|script)(\s|>)/i.test(cap[0])) { 421 inRawBlock = false; 422 } 423 424 return { 425 type: this.options.sanitize 426 ? 'text' 427 : 'html', 428 raw: cap[0], 429 inLink, 430 inRawBlock, 431 text: this.options.sanitize 432 ? (this.options.sanitizer 433 ? this.options.sanitizer(cap[0]) 434 : escape(cap[0])) 435 : cap[0] 436 }; 437 } 438 } 439 440 link(src) { 441 const cap = this.rules.inline.link.exec(src); 442 if (cap) { 443 const lastParenIndex = findClosingBracket(cap[2], '()'); 444 if (lastParenIndex > -1) { 445 const start = cap[0].indexOf('!') === 0 ? 5 : 4; 446 const linkLen = start + cap[1].length + lastParenIndex; 447 cap[2] = cap[2].substring(0, lastParenIndex); 448 cap[0] = cap[0].substring(0, linkLen).trim(); 449 cap[3] = ''; 450 } 451 let href = cap[2]; 452 let title = ''; 453 if (this.options.pedantic) { 454 const link = /^([^'"]*[^\s])\s+(['"])(.*)\2/.exec(href); 455 456 if (link) { 457 href = link[1]; 458 title = link[3]; 459 } else { 460 title = ''; 461 } 462 } else { 463 title = cap[3] ? cap[3].slice(1, -1) : ''; 464 } 465 href = href.trim().replace(/^<([\s\S]*)>$/, '$1'); 466 const token = outputLink(cap, { 467 href: href ? href.replace(this.rules.inline._escapes, '$1') : href, 468 title: title ? title.replace(this.rules.inline._escapes, '$1') : title 469 }, cap[0]); 470 return token; 471 } 472 } 473 474 reflink(src, links) { 475 let cap; 476 if ((cap = this.rules.inline.reflink.exec(src)) 477 || (cap = this.rules.inline.nolink.exec(src))) { 478 let link = (cap[2] || cap[1]).replace(/\s+/g, ' '); 479 link = links[link.toLowerCase()]; 480 if (!link || !link.href) { 481 const text = cap[0].charAt(0); 482 return { 483 type: 'text', 484 raw: text, 485 text 486 }; 487 } 488 const token = outputLink(cap, link, cap[0]); 489 return token; 490 } 491 } 492 493 strong(src, maskedSrc, prevChar = '') { 494 let match = this.rules.inline.strong.start.exec(src); 495 496 if (match && (!match[1] || (match[1] && (prevChar === '' || this.rules.inline.punctuation.exec(prevChar))))) { 497 maskedSrc = maskedSrc.slice(-1 * src.length); 498 const endReg = match[0] === '**' ? this.rules.inline.strong.endAst : this.rules.inline.strong.endUnd; 499 500 endReg.lastIndex = 0; 501 502 let cap; 503 while ((match = endReg.exec(maskedSrc)) != null) { 504 cap = this.rules.inline.strong.middle.exec(maskedSrc.slice(0, match.index + 3)); 505 if (cap) { 506 return { 507 type: 'strong', 508 raw: src.slice(0, cap[0].length), 509 text: src.slice(2, cap[0].length - 2) 510 }; 511 } 512 } 513 } 514 } 515 516 em(src, maskedSrc, prevChar = '') { 517 let match = this.rules.inline.em.start.exec(src); 518 519 if (match && (!match[1] || (match[1] && (prevChar === '' || this.rules.inline.punctuation.exec(prevChar))))) { 520 maskedSrc = maskedSrc.slice(-1 * src.length); 521 const endReg = match[0] === '*' ? this.rules.inline.em.endAst : this.rules.inline.em.endUnd; 522 523 endReg.lastIndex = 0; 524 525 let cap; 526 while ((match = endReg.exec(maskedSrc)) != null) { 527 cap = this.rules.inline.em.middle.exec(maskedSrc.slice(0, match.index + 2)); 528 if (cap) { 529 return { 530 type: 'em', 531 raw: src.slice(0, cap[0].length), 532 text: src.slice(1, cap[0].length - 1) 533 }; 534 } 535 } 536 } 537 } 538 539 codespan(src) { 540 const cap = this.rules.inline.code.exec(src); 541 if (cap) { 542 let text = cap[2].replace(/\n/g, ' '); 543 const hasNonSpaceChars = /[^ ]/.test(text); 544 const hasSpaceCharsOnBothEnds = text.startsWith(' ') && text.endsWith(' '); 545 if (hasNonSpaceChars && hasSpaceCharsOnBothEnds) { 546 text = text.substring(1, text.length - 1); 547 } 548 text = escape(text, true); 549 return { 550 type: 'codespan', 551 raw: cap[0], 552 text 553 }; 554 } 555 } 556 557 br(src) { 558 const cap = this.rules.inline.br.exec(src); 559 if (cap) { 560 return { 561 type: 'br', 562 raw: cap[0] 563 }; 564 } 565 } 566 567 del(src) { 568 const cap = this.rules.inline.del.exec(src); 569 if (cap) { 570 return { 571 type: 'del', 572 raw: cap[0], 573 text: cap[1] 574 }; 575 } 576 } 577 578 autolink(src, mangle) { 579 const cap = this.rules.inline.autolink.exec(src); 580 if (cap) { 581 let text, href; 582 if (cap[2] === '@') { 583 text = escape(this.options.mangle ? mangle(cap[1]) : cap[1]); 584 href = 'mailto:' + text; 585 } else { 586 text = escape(cap[1]); 587 href = text; 588 } 589 590 return { 591 type: 'link', 592 raw: cap[0], 593 text, 594 href, 595 tokens: [ 596 { 597 type: 'text', 598 raw: text, 599 text 600 } 601 ] 602 }; 603 } 604 } 605 606 url(src, mangle) { 607 let cap; 608 if (cap = this.rules.inline.url.exec(src)) { 609 let text, href; 610 if (cap[2] === '@') { 611 text = escape(this.options.mangle ? mangle(cap[0]) : cap[0]); 612 href = 'mailto:' + text; 613 } else { 614 // do extended autolink path validation 615 let prevCapZero; 616 do { 617 prevCapZero = cap[0]; 618 cap[0] = this.rules.inline._backpedal.exec(cap[0])[0]; 619 } while (prevCapZero !== cap[0]); 620 text = escape(cap[0]); 621 if (cap[1] === 'www.') { 622 href = 'http://' + text; 623 } else { 624 href = text; 625 } 626 } 627 return { 628 type: 'link', 629 raw: cap[0], 630 text, 631 href, 632 tokens: [ 633 { 634 type: 'text', 635 raw: text, 636 text 637 } 638 ] 639 }; 640 } 641 } 642 643 inlineText(src, inRawBlock, smartypants) { 644 const cap = this.rules.inline.text.exec(src); 645 if (cap) { 646 let text; 647 if (inRawBlock) { 648 text = this.options.sanitize ? (this.options.sanitizer ? this.options.sanitizer(cap[0]) : escape(cap[0])) : cap[0]; 649 } else { 650 text = escape(this.options.smartypants ? smartypants(cap[0]) : cap[0]); 651 } 652 return { 653 type: 'text', 654 raw: cap[0], 655 text 656 }; 657 } 658 } 659 };