Policy enforcement for Firecrawl web scraping pipelines. Web scraping raises legal (robots.txt, ToS), ethical (rate limiting, attribution), and cost (credit burn) concerns that need automated guardrails.
Block scraping of sensitive or prohibited domains.
const SCRAPE_POLICY = {
  blockedDomains: [
    'facebook.com', 'linkedin.com', // ToS prohibit scraping
    'bank*.com', 'healthcare*.com', // sensitive data
  ],
  maxPagesPerDomain: 500, // cap on pages scraped from any single domain
  requireRobotsTxt: true,
};
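The guard functions in this pipeline throw PolicyViolation, which is not defined in the snippets here; a minimal sketch of such an error class:
class PolicyViolation extends Error {
  constructor(message: string) {
    super(message);
    this.name = 'PolicyViolation';
  }
}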
function validateScrapeTarget(url: string): void {
  const domain = new URL(url).hostname;
  for (const blocked of SCRAPE_POLICY.blockedDomains) {
    // Escape literal dots, then expand the '*' wildcard into a regex.
    const pattern = new RegExp('^' + blocked.replace(/\./g, '\\.').replace('*', '.*') + '$');
    if (pattern.test(domain)) {
      throw new PolicyViolation(`Domain ${domain} is blocked by scraping policy`);
    }
  }
}
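SCRAPE_POLICY also declares requireRobotsTxt, which the validator above does not enforce. A minimal sketch of that check, assuming the robots-parser npm package and the global fetch API (both assumptions, not part of the original pipeline):
import robotsParser from 'robots-parser'; // assumed dependency: npm install robots-parser

async function checkRobotsTxt(url: string, userAgent = 'my-crawler'): Promise<void> {
  if (!SCRAPE_POLICY.requireRobotsTxt) return;
  const robotsUrl = new URL('/robots.txt', url).toString();
  const response = await fetch(robotsUrl);
  if (!response.ok) return; // treating a missing robots.txt as permissive is a policy choice
  const robots = robotsParser(robotsUrl, await response.text());
  if (robots.isAllowed(url, userAgent) === false) {
    throw new PolicyViolation(`robots.txt disallows ${url} for user agent ${userAgent}`);
  }
}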
Prevent crawls from exceeding allocated credit budgets.
class CrawlBudget {
  private dailyLimit: number;
  private usage: Map<string, number> = new Map();

  constructor(dailyLimit = 5000) { this.dailyLimit = dailyLimit; } // default: 5,000 pages/credits per day

  authorize(estimatedPages: number): boolean {
    const today = new Date().toISOString().split('T')[0];
    const used = this.usage.get(today) || 0;
    if (used + estimatedPages > this.dailyLimit) {
      throw new PolicyViolation(
        `Daily limit exceeded: ${used} + ${estimatedPages} > ${this.dailyLimit}`
      );
    }
    return true;
  }

  record(pagesScraped: number): void {
    const today = new Date().toISOString().split('T')[0];
    this.usage.set(today, (this.usage.get(today) || 0) + pagesScraped);
  }
}
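A brief usage sketch: authorize before the crawl, record actual consumption afterward, and treat a PolicyViolation as a signal to defer rather than crash. The runCrawl helper is a placeholder, not part of the pipeline above.
declare function runCrawl(url: string, limit: number): Promise<any[]>; // placeholder for the actual crawl call

const budget = new CrawlBudget(5000);

async function crawlWithBudget(url: string, estimatedPages: number): Promise<any[]> {
  try {
    budget.authorize(estimatedPages);
  } catch (err) {
    if (err instanceof PolicyViolation) {
      console.warn(`Deferring crawl of ${url}: ${err.message}`);
      return [];
    }
    throw err;
  }
  const pages = await runCrawl(url, estimatedPages);
  budget.record(pages.length);
  return pages;
}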
Only retain scraped content that matches expected types; discard binary files, media, and error pages.
function validateScrapedContent(result: any): boolean {
  // Require a minimum amount of markdown content.
  if (!result.markdown || result.markdown.length < 50) return false;
  const lower = result.markdown.toLowerCase();
  // Reject error pages
  if (lower.includes('403 forbidden') || lower.includes('access denied')) return false;
  // Reject login walls
  if (lower.includes('sign in to continue') || lower.includes('create an account')) return false;
  return true;
}
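The check above only catches error pages and login walls; binary files and media, also mentioned above, are cheaper to skip before scraping at all. A minimal sketch of a URL-extension pre-filter (the extension list is illustrative):
const SKIPPED_EXTENSIONS = ['.pdf', '.zip', '.gz', '.png', '.jpg', '.jpeg', '.gif', '.svg', '.mp3', '.mp4', '.webm'];

function isScrapableUrl(url: string): boolean {
  const pathname = new URL(url).pathname.toLowerCase();
  return !SKIPPED_EXTENSIONS.some(ext => pathname.endsWith(ext));
}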
Respect target site capacity even when Firecrawl allows faster crawling.
const DOMAIN_RATE_LIMITS: Record<string, number> = {
  'docs.example.com': 2, // 2 pages/second
  'blog.example.com': 1, // 1 page/second
  'default': 5           // default rate
};

function getCrawlDelay(domain: string): number {
  const rate = DOMAIN_RATE_LIMITS[domain] || DOMAIN_RATE_LIMITS['default'];
  return 1000 / rate; // milliseconds between requests
}
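To actually honor these limits, the computed delay has to be awaited between requests. A sketch of a sequential crawl loop that does this; scrapePage is a placeholder for whatever scrape call the pipeline uses.
declare function scrapePage(url: string): Promise<any>; // placeholder for the actual scrape call

async function politeCrawl(urls: string[]): Promise<any[]> {
  const results: any[] = [];
  for (const url of urls) {
    results.push(await scrapePage(url));
    const domain = new URL(url).hostname;
    await new Promise(resolve => setTimeout(resolve, getCrawlDelay(domain)));
  }
  return results;
}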
| Issue | Cause | Solution |
|---|---|---|
| Legal risk from scraping | Blocked domain not filtered | Enforce domain blocklist |
| Credit overrun | No budget tracking | Implement daily credit caps |
| Junk data in pipeline | Error pages scraped | Validate content quality |
| Target site blocking IP | Too aggressive crawling | Enforce per-domain rate limits |
// End-to-end enforcement: validate the target, reserve budget, crawl,
// filter out junk pages, and record actual usage. Assumes firecrawl is an
// initialized client, budget is a CrawlBudget, and url/estimatedPages are in scope.
validateScrapeTarget(url);
budget.authorize(estimatedPages);
const results = await firecrawl.crawlUrl(url, { limit: estimatedPages });
const valid = results.filter(validateScrapedContent);
budget.record(valid.length);