Production architecture for web scraping and data extraction with FireCrawl. Covers crawl job orchestration, content extraction pipelines, structured data output, and site mapping workflows.
Requires the @mendable/firecrawl-js SDK installed.

┌──────────────────────────────────────────────────────┐
│                  Crawl Orchestrator                   │
│  ┌──────────┐   ┌──────────┐   ┌──────────────────┐  │
│  │  Scrape  │   │  Crawl   │   │       Map        │  │
│  │ (single) │   │ (multi)  │   │   (discovery)    │  │
│  └────┬─────┘   └────┬─────┘   └────────┬─────────┘  │
│       │              │                  │            │
│       ▼              ▼                  ▼            │
│  ┌──────────────────────────────────────────────┐    │
│  │         Content Processing Pipeline          │    │
│  │  Markdown │ HTML │ Screenshots │ Structured  │    │
│  └──────────────────────┬───────────────────────┘    │
│                         │                            │
│  ┌──────────────────────┴───────────────────────┐    │
│  │               Output & Storage                │    │
│  │  JSON Files │ Database │ Vector Store │ S3    │    │
│  └──────────────────────────────────────────────┘    │
└──────────────────────────────────────────────────────┘
import FirecrawlApp from '@mendable/firecrawl-js';

const firecrawl = new FirecrawlApp({
  apiKey: process.env.FIRECRAWL_API_KEY!,
});

// Single page scrape with markdown output
async function scrapePage(url: string) {
  return firecrawl.scrapeUrl(url, {
    formats: ['markdown', 'html'],
    onlyMainContent: true,
    waitFor: 2000, // wait 2 seconds for dynamic content to render
  });
}
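
A minimal usage sketch: check for markdown before using the result. The exact response shape varies by SDK version (some versions nest fields under data), so the result is typed loosely here.

// Loosely typed usage sketch; adapt field access to your SDK version
async function printPage(url: string) {
  const result: any = await scrapePage(url);
  if (result?.markdown) {
    console.log(result.markdown.slice(0, 500)); // first 500 characters as a sanity check
  } else {
    console.error('Scrape failed or returned no markdown:', result?.error);
  }
}
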
// Structured data extraction with schema
async function extractStructured(url: string, schema: any) {
  return firecrawl.scrapeUrl(url, {
    formats: ['extract'],
    extract: {
      schema,
      systemPrompt: 'Extract data precisely according to the schema.',
    },
  });
}
// Crawl an entire site with page limit and path filters
async function crawlSite(baseUrl: string, options?: {
  maxPages?: number;
  includePaths?: string[];
  excludePaths?: string[];
}) {
  const crawlResult = await firecrawl.crawlUrl(baseUrl, {
    limit: options?.maxPages || 50,
    includePaths: options?.includePaths,
    excludePaths: options?.excludePaths || ['/blog/*', '/news/*'],
    scrapeOptions: {
      formats: ['markdown'],
      onlyMainContent: true,
    },
  });
  return crawlResult;
}
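
To cover the Output & Storage layer from the diagram, here is a sketch that writes each crawled page to a JSON file. It assumes the crawl response exposes a data array of documents with markdown and metadata.sourceURL fields; check the shape returned by your SDK version.

import { mkdir, writeFile } from 'node:fs/promises';

// Persist crawl output as one JSON file per page (assumed response shape)
async function saveCrawlToJson(baseUrl: string, outDir = './crawl-output') {
  const crawl = await crawlSite(baseUrl, { maxPages: 50 });
  await mkdir(outDir, { recursive: true });
  const pages = (crawl as any).data ?? []; // assumed: documents live under .data
  for (const [i, page] of pages.entries()) {
    const record = {
      url: page.metadata?.sourceURL,
      title: page.metadata?.title,
      markdown: page.markdown,
    };
    await writeFile(`${outDir}/page-${i}.json`, JSON.stringify(record, null, 2));
  }
  return pages.length;
}
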
// Async crawl for large sites
async function asyncCrawl(baseUrl: string) {
  const job = await firecrawl.asyncCrawlUrl(baseUrl, {
    limit: 500, // cap the crawl at 500 pages
    scrapeOptions: { formats: ['markdown'] },
  });
  // Poll for completion
  let status = await firecrawl.checkCrawlStatus(job.id);
  while (status.status === 'scraping') {
    await new Promise((r) => setTimeout(r, 5000)); // wait 5 seconds between status checks
    status = await firecrawl.checkCrawlStatus(job.id);
  }
  return status;
}
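
The loop above polls until the status changes. As a sketch, a bounded variant avoids waiting forever on a stalled job; the 30-attempt cap is an arbitrary choice, not SDK behavior.

// Poll with an upper bound instead of looping indefinitely
async function waitForCrawl(jobId: string, maxAttempts = 30, intervalMs = 5000) {
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    const status = await firecrawl.checkCrawlStatus(jobId);
    if (status.status !== 'scraping') return status; // completed or failed
    await new Promise((r) => setTimeout(r, intervalMs));
  }
  throw new Error(`Crawl ${jobId} did not finish within ${maxAttempts} status checks`);
}
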
// Map a site to discover its URL structure before crawling
async function discoverSiteStructure(url: string) {
  const mapResult = await firecrawl.mapUrl(url);
  // Categorize discovered URLs
  const structure = {
    pages: mapResult.links?.filter(l => !l.includes('/api/')) || [],
    apiDocs: mapResult.links?.filter(l => l.includes('/api/') || l.includes('/docs/')) || [],
    blog: mapResult.links?.filter(l => l.includes('/blog/')) || [],
    total: mapResult.links?.length || 0,
  };
  return structure;
}
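
Discovery pairs naturally with targeted scraping. The sketch below maps a site and then scrapes a handful of the discovered pages sequentially, pausing between requests to stay under rate limits; the 10-page cap and 1-second delay are arbitrary defaults.

// Map first, then scrape a small sample of the discovered pages
async function mapAndScrape(url: string, maxPages = 10, delayMs = 1000) {
  const structure = await discoverSiteStructure(url);
  const results = [];
  for (const pageUrl of structure.pages.slice(0, maxPages)) {
    results.push(await scrapePage(pageUrl));
    await new Promise((r) => setTimeout(r, delayMs)); // spacing out requests avoids 429s
  }
  return results;
}
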
import { z } from 'zod';

const ProductSchema = z.object({
  name: z.string(),
  price: z.number(),
  description: z.string(),
  features: z.array(z.string()),
  availability: z.enum(['in_stock', 'out_of_stock', 'preorder']),
});
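
The extraction call below passes a hand-written JSON Schema that mirrors ProductSchema. If you want a single source of truth, one option is to derive the JSON Schema from the Zod object with the zod-to-json-schema package (an extra dependency, not part of the FireCrawl SDK):

import { zodToJsonSchema } from 'zod-to-json-schema';

// Derive the JSON Schema from the Zod definition instead of writing it twice
const productJsonSchema = zodToJsonSchema(ProductSchema);
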
// Extract structured product data from a list of URLs
async function extractProducts(urls: string[]) {
  const results = [];
  for (const url of urls) {
    const data = await extractStructured(url, {
      type: 'object',
      properties: {
        name: { type: 'string' },
        price: { type: 'number' },
        description: { type: 'string' },
        features: { type: 'array', items: { type: 'string' } },
        availability: { type: 'string', enum: ['in_stock', 'out_of_stock', 'preorder'] },
      },
      required: ['name', 'price'],
    });
    results.push(data);
  }
  return results;
}
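
Because extraction is LLM-driven, returned objects may not always match the schema exactly. A sketch that validates each result against ProductSchema and keeps only well-formed products; it assumes the extracted object is exposed under an extract field on the response, which varies by SDK version.

import { z } from 'zod';

type Product = z.infer<typeof ProductSchema>;

// Keep only extractions that pass Zod validation
function validateProducts(results: any[]): Product[] {
  const valid: Product[] = [];
  for (const result of results) {
    const candidate = result?.extract ?? result; // assumed response shape
    const parsed = ProductSchema.safeParse(candidate);
    if (parsed.success) valid.push(parsed.data);
  }
  return valid;
}
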
| Issue | Cause | Solution |
|---|---|---|
| Timeout on scrape | Dynamic JS content | Increase waitFor timeout |
| Empty markdown | Content behind paywall | Use authenticated scraping or different URL |
| Crawl incomplete | Hit page limit | Increase limit, use includePaths |
| Rate limit 429 | Too many concurrent scrapes | Add delays between requests |
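
For the rate-limit row above, a small retry wrapper with exponential backoff is often enough. This sketch retries any scrape that throws; adjust the error check to however your SDK version signals a 429.

// Retry a scrape with exponential backoff (1s, 2s, 4s, ...)
async function scrapeWithRetry(url: string, maxRetries = 3) {
  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    try {
      return await scrapePage(url);
    } catch (err) {
      if (attempt === maxRetries) throw err;
      const backoffMs = 1000 * 2 ** attempt;
      console.warn(`Scrape failed (attempt ${attempt + 1}), retrying in ${backoffMs}ms`);
      await new Promise((r) => setTimeout(r, backoffMs));
    }
  }
}
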
// Map the site first to see what documentation exists, then crawl only doc paths
async function scrapeDocumentation(docsUrl: string) {
  const sitemap = await discoverSiteStructure(docsUrl);
  const docPages = sitemap.apiDocs.slice(0, 100);
  console.log(`Discovered ${docPages.length} documentation URLs`);
  const crawl = await crawlSite(docsUrl, {
    maxPages: 100,
    includePaths: ['/docs/*', '/api/*', '/guide/*'],
  });
  return crawl;
}
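
The diagram lists a vector store as one output target. Under the same assumed data/markdown response shape used earlier, the crawled documentation can be split into fixed-size chunks ready for whatever embedding pipeline you use; the 1,500-character chunk size is arbitrary.

// Turn crawled documentation into chunked records for an embedding pipeline
async function prepareDocChunks(docsUrl: string, chunkSize = 1500) {
  const crawl = await scrapeDocumentation(docsUrl);
  const pages = (crawl as any).data ?? []; // assumed crawl response shape
  const chunks: { url: string; text: string }[] = [];
  for (const page of pages) {
    const markdown: string = page.markdown ?? '';
    for (let offset = 0; offset < markdown.length; offset += chunkSize) {
      chunks.push({
        url: page.metadata?.sourceURL ?? docsUrl,
        text: markdown.slice(offset, offset + chunkSize),
      });
    }
  }
  return chunks;
}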