-
Notifications
You must be signed in to change notification settings - Fork 522
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
HTML ➡️ Markdown: Converter changes for Web/JavaScript (#3843)
* markdown transform&report improvements * turn html-code into md-inlineCode * more declarative html query syntax and cover more cases * add screenshot-diff testing for markdown conversion * handle switched code/anchor nesting * escalate unhandled child tags and turn parents into HTML * add notecard support * add callouts conversion * add pretty-printing * md list refinements * add definition lists and fix spacing for inline elements * omit summary class for summarizable pages * disallow blocks in tables * merge h2m commands into a single one with different modes and to-file reporting * pretty HTML in Markdown * add keep mode to h2m cli * turn <code><var> into inlineCode * fix definition list spacing issues * only convert notecards with colons in the title * only flip code - strong/a nesting for single child elements * fix table skipping behavior * fix line-break handling * use less strict toText when getting summary * fix definition list wrapping * stop prettifying HTML as it led to broken MD it created new line-breaks which, while still yielding working HTML, break in Markdown * migrate h2m to TypeScript * discard node position when converting we are not using it anyway and it makes things harder to read * centralize assertions about different node type children * improve MDN conversion report * skip non-standard header tables * AST based prettier annotation removal * convert div element's children * allow ignoring markup inside code blocks * fix nested dl handling * fix dl definition non-block wrapping * add querying for element attribute values * reinstate HTML-in-MD prettification with quickfix * ignore <var> in <code> and turn the rest into emphasis * strip prettier-ignore from source with AST positions This marks the 3rd change of how we strip those tags, and a pendulum swing in an opposite direction of the last one. 
The 1st version simply string replaced all the prettier-ignore tags, which could lead to invalid ASTs (imagine a prettier ignore as a first item in a listItem, which would turn into an invalidly-empty first item space). The 2nd (aka previous) version simply dropped the HTML AST nodes which represented the ignore, but we then used mdast's stringify to turn it back into source which would mess up some of prettier's work (Hamish found that there was too much spacing around list-items). Now we could not throw it back into prettier as the input lost its prettier-ignore tags, which would lead to it wrecking our code-blocks. Hence this 3rd version is a mix of the two! * make h2m cli folder-search non-fuzzy
- Loading branch information
Showing
29 changed files
with
2,104 additions
and
267 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
__image_snapshots__/ |
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,199 @@ | ||
import * as fs from "fs"; | ||
const fm = require("front-matter"); | ||
import { program } from "@caporal/core"; | ||
import * as chalk from "chalk"; | ||
import * as cliProgress from "cli-progress"; | ||
import { Document } from "../content"; | ||
|
||
import { h2m } from "./h2m"; | ||
const { prettyAST } = require("./utils"); | ||
import { m2h, withFm } from "."; | ||
import { toSelector } from "./h2m/utils"; | ||
import { h } from "./h2m/h"; | ||
|
||
/**
 * Wraps a CLI action handler so failures can be reported verbosely.
 *
 * The returned async function forwards its argument object to `f`, with
 * `options` defaulting to an empty object. When `f` throws or rejects and
 * either the `verbose` or `v` option is truthy, the error's stack is printed
 * in red on stderr. The error is always re-thrown to the caller; this wrapper
 * never terminates the process itself.
 *
 * @param f - action handler, invoked with `{ options, ...args }`.
 * @returns an async function accepting the action's argument object.
 */
function tryOrExit(f) {
  return async ({
    options = {},
    ...args
  }: {
    options: { verbose?: boolean; v?: boolean };
  }) => {
    const actionParams = { options, ...args };
    try {
      await f(actionParams);
    } catch (error) {
      // Only print the stack on request; the failure propagates regardless.
      const wantsStack = options.verbose || options.v;
      if (wantsStack) {
        console.error(chalk.red(error.stack));
      }
      throw error;
    }
  };
}
|
||
// CLI definition for `yarn md`, exposing two commands:
//   - h2m: convert HTML documents to Markdown and report what could not be
//          converted automatically.
//   - m2h: convert Markdown documents back to HTML.
program
  .bin("yarn md")
  .name("md")
  .version("0.0.1")
  .disableGlobalOption("--silent")
  .cast(false)

  .command("h2m", "Convert HTML to Markdown")
  // Modes: "dry" writes nothing, "keep" writes the .md file next to the
  // .html file, "replace" writes the .md file and deletes the .html file.
  .option("--mode <mode>", "Mode to be run in", {
    default: "keep",
    validator: ["dry", "keep", "replace"],
  })
  .option("--print-ast", "Prints MD AST", {
    default: false,
    validator: program.BOOLEAN,
  })
  .argument("[folder]", "convert by folder")
  .action(
    tryOrExit(async ({ args, options }) => {
      console.log(
        `Starting HTML to Markdown conversion in ${options.mode} mode`
      );

      const documents = Document.findAll({ folderSearch: args.folder });

      const progressBar = new cliProgress.SingleBar(
        {},
        cliProgress.Presets.shades_classic
      );
      progressBar.start(documents.count);

      // `[folder]` is optional: with no folder given the prefix is empty and
      // every slug matches, instead of crashing on `undefined.toLowerCase()`.
      const folderPrefix = (args.folder || "").toLowerCase();

      // Conversion problems keyed by document URL.
      const problems = new Map<
        string,
        { offset: number; invalid: []; unhandled: [] }
      >();
      try {
        for (const doc of documents.iter()) {
          progressBar.increment();
          if (
            doc.isMarkdown ||
            // findAll's folderSearch is fuzzy which we don't want here
            !doc.metadata.slug.toLowerCase().startsWith(folderPrefix)
          ) {
            continue;
          }
          const { body: html, frontmatter } = fm(doc.rawContent);
          const [markdown, { invalid, unhandled }] = await h2m(html, {
            printAST: options.printAst,
          });

          if (invalid.length > 0 || unhandled.length > 0) {
            problems.set(doc.url, {
              offset: doc.fileInfo.frontMatterOffset,
              invalid,
              unhandled,
            });
          }

          if (options.mode === "replace" || options.mode === "keep") {
            fs.writeFileSync(
              doc.fileInfo.path.replace(/\.html$/, ".md"),
              withFm(frontmatter, markdown)
            );
            if (options.mode === "replace") {
              fs.unlinkSync(doc.fileInfo.path);
            }
          }
        }
      } finally {
        // Always stop the progress bar, even when a conversion throws.
        progressBar.stop();
      }

      const now = new Date();
      const report = [
        `# Report from ${now.toLocaleString()}`,

        "## Top 20 unhandled elements",
        ...Array.from(
          Array.from(problems)
            .flatMap(([, { invalid, unhandled }]) => [
              ...invalid.map((e: any) => e.source),
              ...unhandled,
            ])
            .map((node) =>
              node.type === "element" ? toSelector(node) : node.type
            )
            // Tally how often each label occurs.
            .reduce(
              (top, label) => top.set(label, (top.get(label) || 0) + 1),
              new Map()
            )
        )
          // Most frequent first; the comparator returns 0 for equal counts.
          .sort(([, c1], [, c2]) => c2 - c1)
          .slice(0, 20)
          .map(([label, count]) => `- ${label} (${count})`),

        "## Details per Document",
      ];
      let problemCount = 0;
      for (const [url, { offset, invalid, unhandled }] of Array.from(
        problems
      )) {
        problemCount += invalid.length + unhandled.length;
        report.push(`### [${url}](https://developer.mozilla.org${url})`);

        // Formats a node as "<label> (<line>:<column>)"; the line number is
        // shifted by the document's front-matter offset.
        const elementWithPosition = (node) => {
          const {
            type,
            position: {
              start: { line, column },
            },
          } = node;
          const label = type === "element" ? toSelector(node) : type;
          return `${label} (${line + offset}:${column})`;
        };

        if (invalid.length > 0) {
          report.push(
            "#### Invalid AST transformations",
            ...invalid.map(({ source, targetType, unexpectedChildren }: any) =>
              [
                `##### ${elementWithPosition(source)} => ${targetType}`,
                "```",
                // Spread so each child lands on its own line; a nested array
                // would be stringified with commas by join().
                ...unexpectedChildren.map((node) => prettyAST(node)),
                "```",
              ].join("\n")
            )
          );
        }

        if (unhandled.length > 0) {
          report.push(
            // Same heading level as "Invalid AST transformations" so both
            // sections nest under the per-document heading.
            "#### Missing conversion rules",
            ...unhandled.map((node) => "- " + elementWithPosition(node))
          );
        }
      }
      if (problemCount > 0) {
        const reportFileName = `md-conversion-problems-report-${now.toISOString()}.md`;
        console.log(
          `Could not automatically convert ${problemCount} elements. Saving report to ${reportFileName}`
        );
        fs.writeFileSync(reportFileName, report.join("\n"));
      }
    })
  )

  .command("m2h", "Convert Markdown to HTML")
  .argument("[folder]", "convert by folder")
  .action(
    tryOrExit(async ({ args }) => {
      const all = Document.findAll({ folderSearch: args.folder });
      for (const doc of all.iter()) {
        if (!doc.isMarkdown) {
          continue;
        }
        const { body: markdown, frontmatter } = fm(doc.rawContent);
        const html = await m2h(markdown);
        // Write the HTML next to the Markdown source (.md -> .html).
        fs.writeFileSync(
          doc.fileInfo.path.replace(/\.md$/, ".html"),
          withFm(frontmatter, html)
        );
      }
    })
  );

program.run();
Oops, something went wrong.