// blob: cc5964e8fe54c5bf31f90cd22b501853ef93ca06 [file] [log] [blame]
const fs = require('fs');
const { parse } = require('node-html-parser');
const { createTurndownService } = require('../helpers/turndown-config');
const { generateToonSitemaps } = require('../helpers/toon-format');
const { generateLlmsTxt } = require('../helpers/llms-txt');
const { generateReleasesIndex, generateBlogIndex } = require('../helpers/rss-feed');
const { generateAllIndexes } = require('../helpers/html-index');
/**
 * Absolute links into the Camel website whose path contains `.html`.
 * Both http and https are accepted so that every same-site absolute link is
 * retargeted by this pattern alone.
 */
const CAMEL_SITE_HTML_URL = /(https?:\/\/camel\.apache\.org\/[^)\s]*?)\.html/g;

/**
 * Markdown link destinations that are relative (no URL scheme, not
 * protocol-relative) and whose path ends in `.html`, optionally followed by a
 * `#fragment`, a `?query` or a link title.
 */
const RELATIVE_HTML_LINK = /(\]\()(?![a-z][a-z0-9+.-]*:|\/\/)([^)\s#?]+)\.html(?=[)#?\s])/gi;

/**
 * Points links at the generated Markdown files instead of the HTML pages.
 *
 * Only links that stay on this site are rewritten: absolute camel.apache.org
 * URLs and relative Markdown links. Links to other hosts are left untouched,
 * because no `.md` counterpart exists for them.
 *
 * @param {string} markdown - Markdown text produced by the converter.
 * @returns {string} The Markdown text with site-internal `.html` links
 *   replaced by `.md` links; fragments and queries are preserved.
 */
function rewriteHtmlLinks(markdown) {
  return markdown
    .replace(CAMEL_SITE_HTML_URL, '$1.md')
    .replace(RELATIVE_HTML_LINK, '$1$2.md');
}

/**
 * Strips the wrapper markup found inside `tableblock` cells so that only the
 * cell's inner content remains.
 *
 * @param {string} html - Inner HTML of a table cell.
 * @returns {string} The HTML with the wrappers removed.
 */
function unwrapTableCellHtml(html) {
  return html
    // <div class="content"><div class="paragraph"><p>...</p></div></div>
    .replace(/<div class="content"><div class="paragraph">\s*<p>(.*?)<\/p>\s*<\/div><\/div>/gs, '$1')
    // Same wrapper, but with extra attributes (e.g. an id) on the inner div
    .replace(/<div class="content"><div[^>]*class="paragraph"[^>]*>\s*<p>(.*?)<\/p>\s*<\/div><\/div>/gs, '$1')
    // Plain <p class="tableblock">...</p> wrappers
    .replace(/<p class="tableblock">(.*?)<\/p>/gs, '$1');
}

/**
 * Maps an HTML file path to the path of its Markdown sibling.
 *
 * @param {string} htmlFile - Path ending in `.html`.
 * @returns {string} The same path with the trailing `.html` replaced by `.md`.
 */
function toMarkdownPath(htmlFile) {
  return htmlFile.replace(/\.html$/, '.md');
}

/**
 * Maps an HTML file path under `public/` to the URL path of its Markdown
 * sibling. Both replacements are anchored so that only the leading directory
 * and the trailing extension are touched, never a look-alike in the middle
 * of the path.
 *
 * @param {string} htmlFile - Path such as `public/manual/faq.html`.
 * @returns {string} URL path such as `/manual/faq.md`.
 */
function toUrlPath(htmlFile) {
  return toMarkdownPath(htmlFile.replace(/^public\//, '/'));
}

/**
 * Converts one HTML document to Markdown, keeping only its main content.
 *
 * @param {string} htmlContent - Full HTML source of the page.
 * @param {object} turndownService - Converter exposing `turndown(html)`.
 * @returns {string|null} The Markdown text, or `null` when the page has no
 *   recognizable main content element.
 */
function convertHtmlToMarkdown(htmlContent, turndownService) {
  const root = parse(htmlContent);

  // Try the selectors in order of specificity; the first match wins.
  const mainContent =
    root.querySelector('article.doc') ||
    root.querySelector('main') ||
    root.querySelector('.article') ||
    root.querySelector('article');
  if (!mainContent) {
    return null;
  }

  // Drop page chrome and anchor links; they are UI navigation aids only.
  mainContent
    .querySelectorAll('nav, header, footer, .nav, .navbar, .toolbar')
    .forEach((el) => el.remove());
  mainContent.querySelectorAll('a.anchor').forEach((el) => el.remove());

  mainContent.querySelectorAll('td.tableblock, th.tableblock').forEach((cell) => {
    cell.set_content(unwrapTableCellHtml(cell.innerHTML));
  });

  return rewriteHtmlLinks(turndownService.turndown(mainContent.innerHTML));
}

/**
 * Generates Markdown (.md) files from HTML files for LLM consumption, as per
 * the https://llmstxt.org/ specification.
 *
 * For every `public/**\/*.html` file (except `public/404.html` and
 * `index.html` pages) a sibling `.md` file is written containing:
 * - only the main article content (nav, header, footer and toolbars removed)
 * - Markdown produced by the Turndown service
 * - site-internal links retargeted from `.html` to `.md`
 *
 * Pages without a main content element are skipped silently. A failure on one
 * page is logged and does not stop the run. Afterwards llms.txt, the toon
 * sitemaps, the releases and blog indexes and the remaining index files are
 * generated.
 *
 * @param {Function} done - Completion callback, invoked with no arguments
 *   once all outputs have been generated.
 * @returns {Promise<void>} Rejects if one of the index generators fails.
 */
async function generateMarkdown(done) {
  const turndownService = createTurndownService();
  // URL paths of the generated Markdown files, collected for llms.txt
  const processedPages = [];
  const glob = require('glob');

  const htmlFiles = glob.sync('public/**/*.html', {
    ignore: ['public/404.html', 'public/**/index.html'] // Skip error pages and index pages
  });
  console.log(`Found ${htmlFiles.length} HTML files to convert`);

  // Files are handled one at a time and synchronously, so only a single
  // parsed document is held per iteration.
  for (const htmlFile of htmlFiles) {
    try {
      const markdown = convertHtmlToMarkdown(fs.readFileSync(htmlFile, 'utf8'), turndownService);
      if (markdown === null) {
        // Silently skip files without main content
        continue;
      }
      fs.writeFileSync(toMarkdownPath(htmlFile), markdown, 'utf8');
      processedPages.push(toUrlPath(htmlFile));
    } catch (error) {
      // Best effort: report the failing page and carry on with the rest.
      console.error(`Error processing ${htmlFile}:`, error.message);
    }
  }
  console.log(`\nSuccessfully generated ${processedPages.length} Markdown files`);

  // Generate llms.txt file
  generateLlmsTxt(processedPages);
  // Generate toon format sitemaps
  await generateToonSitemaps();
  // Generate toon format for releases RSS feed
  await generateReleasesIndex();
  // Generate toon format for blog RSS feed
  await generateBlogIndex();
  // Generate all other index files
  await generateAllIndexes();
  done();
}
// Expose the task function as this module's only export.
module.exports = generateMarkdown;