HTML ➡️ Markdown: Converter changes for Web/JavaScript (#3843)

* markdown transform&report improvements * turn html-code into md-inlineCode * more declarative html query syntax and cover more cases * add screenshot-diff testing for markdown conversion * handle switched code/anchor nesting * escalate unhandled child tags and turn parents into HTML * add notecard support * add callouts conversion * add pretty-printing * md list refinements * add definition lists and fix spacing for inline elements * omit summary class for summarizable pages * disallow blocks in tables * merge h2m commands into a single one with different modes and to-file reporting * pretty HTML in Markdown * add keep mode to h2m cli * turn <code><var> into inlineCode * fix definition list spacing issues * only convert notecards with colons in the title * only flip code - strong/a nesting for single child elements * fix table skipping behavior * fix line-break handling * use less strict toText when getting summary * fix definition list wrapping * stop prettifying HTML as it led to broken MD: it created new line-breaks which, while still yielding working HTML, break in Markdown * migrate h2m to TypeScript * discard node position when converting: we are not using it anyway and it makes things harder to read * centralize assertions about different node type children * improve MDN conversion report * skip non-standard header tables * AST based prettier annotation removal * convert div element's children * allow ignoring markup inside code blocks * fix nested dl handling * fix dl definition non-block wrapping * add querying for element attribute values * reinstate HTML-in-MD prettification with quickfix * ignore <var> in <code> and turn the rest into emphasis * strip prettier-ignore from source with AST positions This marks the 3rd change of how we strip those tags, and a pendulum swing in an opposite direction of the last one. 
The 1st version simply string replaced all the prettier-ignore tags, which could lead to invalid ASTs (imagine a prettier ignore as a first item in a listItem, which would turn into an invalidly-empty first item space). The 2nd (aka previous) version simply dropped the HTML AST nodes which represented the ignore, but we then used mdast's stringify to turn it back into source which would mess up some of prettier's work (Hamish found that there was too much spacing around list-items). Now we could not throw it back into prettier as the input lost its prettier-ignore tags, which would lead to it wrecking our code-blocks. Hence this 3rd version is a mix of the two! * make h2m cli folder-search non-fuzzy
mdn · Jun 14, 2021 · b0dbaed · b0dbaed
1 parent 2d98119
commit b0dbaed
Show file tree

Hide file tree

Showing 29 changed files with 2,104 additions and 267 deletions.
diff --git a/content/document.js b/content/document.js
@@ -271,12 +271,6 @@ const read = memoize((folderOrFilePath, roots = ROOTS) => {
   } else {
     folder = folderOrFilePath;
     for (const possibleRoot of roots) {
-      const possibleHTMLFilePath = path.join(possibleRoot, getHTMLPath(folder));
-      if (fs.existsSync(possibleHTMLFilePath)) {
-        root = possibleRoot;
-        filePath = possibleHTMLFilePath;
-        break;
-      }
       const possibleMarkdownFilePath = path.join(
         possibleRoot,
         getMarkdownPath(folder)
@@ -286,6 +280,12 @@ const read = memoize((folderOrFilePath, roots = ROOTS) => {
         filePath = possibleMarkdownFilePath;
         break;
       }
+      const possibleHTMLFilePath = path.join(possibleRoot, getHTMLPath(folder));
+      if (fs.existsSync(possibleHTMLFilePath)) {
+        root = possibleRoot;
+        filePath = possibleHTMLFilePath;
+        break;
+      }
     }
     if (!filePath) {
       return;

diff --git a/kumascript/src/api/util.js b/kumascript/src/api/util.js
@@ -185,7 +185,7 @@ class HTMLTool {
     const result = Object.create(null);
     const sample = this.getSection(sampleID);
     // We have to wrap the collection of elements from the section
-    // we've just aquired because we're going to search among all
+    // we've just acquired because we're going to search among all
     // descendants and we want to include the elements themselves
     // as well as their descendants.
     const $ = cheerio.load(`<div>${cheerio.html(sample)}</div>`);

diff --git a/markdown/.gitignore b/markdown/.gitignore
@@ -0,0 +1 @@
+__image_snapshots__/
diff --git a/markdown/cli.js b/markdown/cli.js
diff --git a/markdown/cli.ts b/markdown/cli.ts
@@ -0,0 +1,199 @@
+import * as fs from "fs";
+const fm = require("front-matter");
+import { program } from "@caporal/core";
+import * as chalk from "chalk";
+import * as cliProgress from "cli-progress";
+import { Document } from "../content";
+
+import { h2m } from "./h2m";
+const { prettyAST } = require("./utils");
+import { m2h, withFm } from ".";
+import { toSelector } from "./h2m/utils";
+import { h } from "./h2m/h";
+
+/**
+ * Wraps a command action so that failures can be reported verbosely.
+ *
+ * The returned async function forwards its parameters (with `options`
+ * defaulting to an empty object) to `f`. If `f` throws or rejects, the
+ * error's stack is printed in red to stderr when either the `verbose` or
+ * the `v` option is set. The error is always re-thrown afterwards, so this
+ * wrapper does not terminate the process by itself.
+ */
+function tryOrExit(f) {
+  type ActionParams = {
+    options: { verbose?: boolean; v?: boolean };
+  };
+
+  return async ({ options = {}, ...rest }: ActionParams) => {
+    try {
+      await f({ options, ...rest });
+    } catch (err) {
+      const wantsStack = options.verbose || options.v;
+      if (wantsStack) {
+        console.error(chalk.red(err.stack));
+      }
+      // Propagate the failure to the caller regardless of verbosity.
+      throw err;
+    }
+  };
+}
+
+// Problems collected for a single document while converting it to Markdown.
+// The node shapes are produced by h2m and are not declared in this file,
+// which is why the element type is left open.
+type ConversionProblems = {
+  // Added to reported line numbers; taken from doc.fileInfo.frontMatterOffset.
+  offset: number;
+  invalid: any[];
+  unhandled: any[];
+};
+
+// CLI definition: `yarn md h2m [folder]` and `yarn md m2h [folder]`.
+program
+  .bin("yarn md")
+  .name("md")
+  .version("0.0.1")
+  .disableGlobalOption("--silent")
+  .cast(false)
+
+  .command("h2m", "Convert HTML to Markdown")
+  .option("--mode <mode>", "Mode to be run in", {
+    default: "keep",
+    validator: ["dry", "keep", "replace"],
+  })
+  .option("--print-ast", "Prints MD AST", {
+    default: false,
+    validator: program.BOOLEAN,
+  })
+  .argument("[folder]", "convert by folder")
+  .action(
+    tryOrExit(async ({ args, options }) => {
+      console.log(
+        `Starting HTML to Markdown conversion in ${options.mode} mode`
+      );
+
+      const documents = Document.findAll({ folderSearch: args.folder });
+
+      // `[folder]` is optional. Without it the prefix is the empty string,
+      // which every slug starts with, so no document is filtered out.
+      const folderPrefix = (args.folder || "").toLowerCase();
+
+      const progressBar = new cliProgress.SingleBar(
+        {},
+        cliProgress.Presets.shades_classic
+      );
+      progressBar.start(documents.count);
+
+      // Keyed by document URL; only documents with at least one problem
+      // get an entry.
+      const problems = new Map<string, ConversionProblems>();
+      try {
+        for (const doc of documents.iter()) {
+          progressBar.increment();
+          if (
+            doc.isMarkdown ||
+            // findAll's folderSearch is fuzzy which we don't want here
+            !doc.metadata.slug.toLowerCase().startsWith(folderPrefix)
+          ) {
+            continue;
+          }
+          // Named `html` rather than `h` to avoid shadowing the imported `h`.
+          const { body: html, frontmatter } = fm(doc.rawContent);
+          const [markdown, { invalid, unhandled }] = await h2m(html, {
+            printAST: options.printAst,
+          });
+
+          if (invalid.length > 0 || unhandled.length > 0) {
+            problems.set(doc.url, {
+              offset: doc.fileInfo.frontMatterOffset,
+              invalid,
+              unhandled,
+            });
+          }
+
+          // "dry" writes nothing; "keep" writes the .md next to the .html;
+          // "replace" additionally removes the .html source.
+          if (options.mode === "replace" || options.mode === "keep") {
+            fs.writeFileSync(
+              doc.fileInfo.path.replace(/\.html$/, ".md"),
+              withFm(frontmatter, markdown)
+            );
+            if (options.mode === "replace") {
+              fs.unlinkSync(doc.fileInfo.path);
+            }
+          }
+        }
+      } finally {
+        // Always restore the terminal, even if a conversion throws.
+        progressBar.stop();
+      }
+
+      const now = new Date();
+      const report = [
+        `# Report from ${now.toLocaleString()}`,
+
+        "## Top 20 unhandled elements",
+        ...Array.from(
+          Array.from(problems)
+            .flatMap(([, { invalid, unhandled }]) => [
+              ...invalid.map((e: any) => e.source),
+              ...unhandled,
+            ])
+            .map((node) =>
+              node.type === "element" ? toSelector(node) : node.type
+            )
+            // Count occurrences per label.
+            .reduce(
+              (top, label) => top.set(label, (top.get(label) || 0) + 1),
+              new Map<string, number>()
+            )
+        )
+          // Most frequent first; equal counts compare as equal so the
+          // comparator stays consistent.
+          .sort(([, c1], [, c2]) => c2 - c1)
+          .slice(0, 20)
+          .map(([label, count]) => `- ${label} (${count})`),
+
+        "## Details per Document",
+      ];
+      let problemCount = 0;
+      for (const [url, { offset, invalid, unhandled }] of Array.from(
+        problems
+      )) {
+        problemCount += invalid.length + unhandled.length;
+        report.push(`### [${url}](https://developer.mozilla.org${url})`);
+
+        // Renders "<selector or node type> (<line>:<column>)", shifting the
+        // line by this document's front-matter offset.
+        const elementWithPosition = (node) => {
+          const {
+            type,
+            position: {
+              start: { line, column },
+            },
+          } = node;
+          const label = type === "element" ? toSelector(node) : type;
+          return `${label} (${line + offset}:${column})`;
+        };
+
+        if (invalid.length > 0) {
+          report.push(
+            "#### Invalid AST transformations",
+            ...invalid.map(({ source, targetType, unexpectedChildren }: any) =>
+              [
+                `##### ${elementWithPosition(source)} => ${targetType}`,
+                "```",
+                // Spread so each child lands on its own line; a nested
+                // array would be stringified with commas by join().
+                ...unexpectedChildren.map((node) => prettyAST(node)),
+                "```",
+              ].join("\n")
+            )
+          );
+        }
+
+        if (unhandled.length > 0) {
+          report.push(
+            // Same level as "Invalid AST transformations": both are
+            // subsections of the per-document "###" heading.
+            "#### Missing conversion rules",
+            ...unhandled.map((node) => "- " + elementWithPosition(node))
+          );
+        }
+      }
+      if (problemCount > 0) {
+        const reportFileName = `md-conversion-problems-report-${now.toISOString()}.md`;
+        console.log(
+          `Could not automatically convert ${problemCount} elements. Saving report to ${reportFileName}`
+        );
+        fs.writeFileSync(reportFileName, report.join("\n"));
+      }
+    })
+  )
+
+  .command("m2h", "Convert Markdown to HTML")
+  .argument("[folder]", "convert by folder")
+  .action(
+    tryOrExit(async ({ args }) => {
+      const all = Document.findAll({ folderSearch: args.folder });
+      for (const doc of all.iter()) {
+        if (!doc.isMarkdown) {
+          continue;
+        }
+        const { body: markdown, frontmatter } = fm(doc.rawContent);
+        // Named `html` rather than `h` to avoid shadowing the imported `h`.
+        const html = await m2h(markdown);
+        fs.writeFileSync(
+          doc.fileInfo.path.replace(/\.md$/, ".html"),
+          withFm(frontmatter, html)
+        );
+      }
+    })
+  );
+
+program.run();