Manage scraped web content from FireCrawl pipelines. Covers content extraction filtering, HTML sanitization, markdown cleaning, structured data validation, and storage patterns for crawled content.
Install the @mendable/firecrawl-js SDK, then import and initialize the client:
import FirecrawlApp from '@mendable/firecrawl-js';
// Shared FireCrawl client used by every helper below.
// Requires FIRECRAWL_API_KEY to be set in the environment (the `!` assumes it is;
// NOTE(review): consider failing fast with a clear error if the key is missing).
const firecrawl = new FirecrawlApp({
apiKey: process.env.FIRECRAWL_API_KEY!,
});
// Scrape with controlled output formats
async function scrapeClean(url: string) {
const result = await firecrawl.scrapeUrl(url, {
formats: ['markdown'], // Markdown is cleanest for LLMs
onlyMainContent: true, // Strip nav, footer, sidebar
excludeTags: ['script', 'style', 'nav', 'footer', 'iframe'],
waitFor: 2000, # 2000: 2 seconds in ms
});
return {
markdown: cleanMarkdown(result.markdown || ''),
metadata: result.metadata,
url,
scrapedAt: new Date().toISOString(),
};
}
/**
 * Normalize scraped markdown for downstream use: collapse runs of 3+ newlines
 * into a single blank line, strip `javascript:` links, inline data-URI images,
 * and HTML comments, then trim surrounding whitespace.
 *
 * @param md - Raw markdown as returned by FireCrawl.
 * @returns The sanitized markdown string.
 */
function cleanMarkdown(md: string): string {
  // Applied in order; order matters only for readability here since the
  // patterns target disjoint constructs.
  const substitutions: Array<[RegExp, string]> = [
    [/\n{3,}/g, '\n\n'],              // collapse excessive blank lines
    [/\[.*?\]\(javascript:.*?\)/g, ''], // drop javascript: pseudo-links
    [/!\[.*?\]\(data:.*?\)/g, ''],      // drop inline data-URI images
    [/<!--[\s\S]*?-->/g, ''],           // drop HTML comments
  ];
  let cleaned = md;
  for (const [pattern, replacement] of substitutions) {
    cleaned = cleaned.replace(pattern, replacement);
  }
  return cleaned.trim();
}
import { z } from 'zod';
// Runtime validation schema for extracted articles (see extractArticle).
// title and content are required — content must be at least 50 characters
// to reject near-empty extractions; author/publishedDate are optional since
// many pages omit them. wordCount is computed by the caller, not extracted.
const ArticleSchema = z.object({
title: z.string().min(1),
author: z.string().optional(),
publishedDate: z.string().optional(),
content: z.string().min(50),
wordCount: z.number(),
});
/**
 * Extract structured article fields from a page using FireCrawl's LLM
 * extraction, then validate the result with ArticleSchema.
 *
 * @param url - Article page to extract from.
 * @returns A validated article object including a computed wordCount.
 * @throws Error when FireCrawl returns no extraction; ZodError on schema failure.
 */
async function extractArticle(url: string) {
  // JSON-Schema shape handed to FireCrawl's extraction endpoint.
  const extractionSchema = {
    type: 'object',
    properties: {
      title: { type: 'string' },
      author: { type: 'string' },
      publishedDate: { type: 'string' },
      content: { type: 'string' },
    },
    required: ['title', 'content'],
  };
  const result = await firecrawl.scrapeUrl(url, {
    formats: ['extract'],
    extract: { schema: extractionSchema },
  });
  const data = result.extract;
  if (!data) throw new Error('Extraction failed');
  // Whitespace-token count; 0 when content is absent.
  const wordCount = data.content?.split(/\s+/).length || 0;
  return ArticleSchema.parse({ ...data, wordCount });
}
import { writeFileSync, mkdirSync } from 'fs';
import { join } from 'path';
/**
 * Crawl a site with FireCrawl and persist each page as a cleaned markdown
 * file, plus a manifest.json describing what was stored.
 *
 * @param baseUrl   - Root URL to crawl.
 * @param outputDir - Directory to write files into (created if missing).
 * @param options   - maxPages caps the crawl (default 50); includePaths
 *                    restricts crawled URL paths.
 * @returns The manifest: one { url, path, size } entry per stored page.
 */
async function crawlAndStore(
  baseUrl: string,
  outputDir: string,
  options?: { maxPages?: number; includePaths?: string[] }
) {
  mkdirSync(outputDir, { recursive: true });
  const crawlResult = await firecrawl.crawlUrl(baseUrl, {
    limit: options?.maxPages || 50,
    includePaths: options?.includePaths,
    scrapeOptions: {
      formats: ['markdown'],
      onlyMainContent: true,
    },
  });
  const manifest: Array<{ url: string; path: string; size: number }> = [];
  for (const page of crawlResult.data || []) {
    // Flatten the URL path into a filename slug, e.g. /docs/api -> docs_api.
    const slug = new URL(page.metadata?.sourceURL || baseUrl)
      .pathname.replace(/\//g, '_').replace(/^_|_$/g, '') || 'index';
    // BUG FIX: original used the constant literal `$firecrawl-data-handling.md`
    // for every page (a mangled template literal), so each write clobbered the
    // previous one and the computed slug was never used.
    const filename = `${slug}.md`;
    const filePath = join(outputDir, filename);
    const content = cleanMarkdown(page.markdown || '');
    writeFileSync(filePath, content);
    manifest.push({
      url: page.metadata?.sourceURL || baseUrl,
      path: filename,
      size: content.length,
    });
  }
  writeFileSync(join(outputDir, 'manifest.json'), JSON.stringify(manifest, null, 2));
  return manifest;
}
import { createHash } from 'crypto';
/**
 * Content fingerprint for deduplication: SHA-256 hex digest of the
 * whitespace-trimmed, lowercased text, so trivially reformatted copies
 * of the same page hash identically.
 */
function contentHash(text: string): string {
  const normalized = text.trim().toLowerCase();
  return createHash('sha256').update(normalized).digest('hex');
}

/**
 * Split crawled pages into unique content and duplicates.
 *
 * The first page seen with a given content hash is kept; later pages with the
 * same hash are reported as duplicates of that first URL. Input order is
 * preserved in `unique`.
 *
 * @param pages - Crawled pages with their URL and text content.
 * @returns `unique` (first occurrence of each content hash) and `duplicates`
 *          (each later occurrence, pointing at the URL it duplicates).
 */
function deduplicatePages(pages: Array<{ url: string; content: string }>) {
  const firstSeen = new Map<string, string>(); // content hash -> first URL
  const unique: typeof pages = [];
  const duplicates: Array<{ url: string; duplicateOf: string }> = [];
  for (const page of pages) {
    const fingerprint = contentHash(page.content);
    const original = firstSeen.get(fingerprint);
    if (original !== undefined) {
      duplicates.push({ url: page.url, duplicateOf: original });
    } else {
      firstSeen.set(fingerprint, page.url);
      unique.push(page);
    }
  }
  return { unique, duplicates };
}
| Issue | Cause | Solution |
|---|---|---|
| Empty content | Dynamic JS not loaded | Increase waitFor timeout |
| Garbage in markdown | Bad HTML cleanup | Use onlyMainContent and excludeTags |
| Duplicate pages | URL aliases or redirects | Hash content for deduplication |
| Large file sizes | Full HTML stored | Use markdown format only |
// Example usage: crawl the docs site (docs + API reference paths only),
// capped at 100 pages, storing cleaned markdown under ./scraped-docs.
const crawlOptions = {
  maxPages: 100,
  includePaths: ['/docs/*', '/api/*'],
};
const docs = await crawlAndStore('https://docs.example.com', './scraped-docs', crawlOptions);
console.log(`Scraped ${docs.length} pages`);