Web Scraping & Data Extraction
Extract structured data from complex websites that use JavaScript rendering, anti-bot measures, and dynamic content loading.
The Challenge
Modern websites make scraping difficult with:
- Client-side rendering (SPAs) that require full JS execution
- Aggressive bot detection (Cloudflare, DataDome, PerimeterX)
- Rate limiting and IP blocking
- Infinite scroll and lazy-loaded content
- Dynamic selectors and obfuscated class names
Single-Page Extraction
import { Meshbrow } from '@meshbrow/sdk';
import { chromium, Page } from 'playwright';
// Shared Meshbrow client used by the scraping functions in this file.
// The API key is read from the MESHBROW_API_KEY environment variable; the `!`
// only silences the type checker, so an unset variable is passed through as
// `undefined` at runtime rather than being rejected here.
const client = new Meshbrow({ apiKey: process.env.MESHBROW_API_KEY! });
/**
 * Normalized product record returned by `scrapeProductPage`.
 */
interface Product {
  /** Product name (JSON-LD `name`, or the page's first `<h1>` in the DOM fallback). */
  title: string;
  /** Numeric price parsed from the page. */
  price: number;
  /** Currency code; the scraper defaults this to `'USD'` when none is found. */
  currency: string;
  /** Aggregate rating value; `0` when the page exposes none. */
  rating: number;
  /** Review count; `0` when the page exposes none. */
  reviews: number;
  /** URL of the page the data was read from. */
  url: string;
  /** Primary product image URL, or an empty string when none was found. */
  imageUrl: string;
  /** Human-readable stock status label (e.g. `'In Stock'`, `'Out of Stock'`, `'Unknown'`). */
  availability: string;
}
/**
 * Scrapes a single product page through a stealth Meshbrow session.
 *
 * Extraction runs inside the page in two stages:
 * 1. Every `application/ld+json` script is scanned for a schema.org `Product`
 *    node (top-level object, top-level array, or inside an `@graph` array).
 * 2. If none is found, a heuristic DOM fallback reads the first `<h1>`, a
 *    price-like element and a product image.
 *
 * Numeric fields that cannot be parsed are reported as `0` (never `NaN`), and
 * `availability` is `'Unknown'` when the page does not state it.
 *
 * @param url - Absolute URL of the product page.
 * @returns The normalized product record.
 * @throws Propagates session-creation, connection, navigation and evaluation
 *   errors. Once a session has been created it is always destroyed, and the
 *   CDP connection is always closed.
 */
async function scrapeProductPage(url: string): Promise<Product> {
  const session = await client.sessions.create({
    stealth: 'max',
    proxy: { type: 'residential', country: 'US' },
  });

  try {
    const browser = await chromium.connectOverCDP(session.cdpUrl);
    try {
      const page = browser.contexts()[0]?.pages()[0];
      if (!page) {
        throw new Error('Meshbrow session exposed no open page to drive');
      }

      await page.goto(url, { waitUntil: 'networkidle' });
      // Grace period for client-side rendering that continues after the network goes idle.
      await page.waitForTimeout(2000);

      // The callback is serialized and executed in the browser, so every helper
      // it needs must be declared inside it.
      return await page.evaluate((): Product => {
        type JsonObject = Record<string, unknown>;

        const isObject = (value: unknown): value is JsonObject =>
          typeof value === 'object' && value !== null && !Array.isArray(value);

        // schema.org allows most properties to be a single value or an array.
        const firstOf = (value: unknown): unknown => (Array.isArray(value) ? value[0] : value);

        const toNumber = (value: unknown): number => {
          const parsed = typeof value === 'number' ? value : parseFloat(String(value ?? ''));
          return Number.isFinite(parsed) ? parsed : 0;
        };

        const isProduct = (node: unknown): node is JsonObject => {
          if (!isObject(node)) return false;
          const type = node['@type'];
          return Array.isArray(type) ? type.includes('Product') : type === 'Product';
        };

        const findProduct = (root: unknown): JsonObject | undefined => {
          const candidates: unknown[] = Array.isArray(root) ? root : [root];
          for (const candidate of candidates) {
            if (isProduct(candidate)) return candidate;
            const graph = isObject(candidate) ? candidate['@graph'] : undefined;
            if (Array.isArray(graph)) {
              const nested = graph.find(isProduct);
              if (nested) return nested;
            }
          }
          return undefined;
        };

        // Stage 1: structured data (JSON-LD).
        const scripts = Array.from(
          document.querySelectorAll('script[type="application/ld+json"]')
        );
        for (const script of scripts) {
          let parsed: unknown;
          try {
            parsed = JSON.parse(script.textContent ?? '');
          } catch {
            // Malformed JSON-LD is common in the wild; move on to the next
            // block (and ultimately the DOM fallback) instead of failing.
            continue;
          }

          const data = findProduct(parsed);
          if (!data) continue;

          const offersNode = firstOf(data['offers']);
          const offers: JsonObject = isObject(offersNode) ? offersNode : {};
          const ratingNode = firstOf(data['aggregateRating']);
          const aggregateRating: JsonObject = isObject(ratingNode) ? ratingNode : {};
          // `image` may be a URL string, an array of them, or an ImageObject.
          const image = firstOf(data['image']);
          const availability = offers['availability'];

          let imageUrl = '';
          if (typeof image === 'string') {
            imageUrl = image;
          } else if (isObject(image)) {
            imageUrl = String(image['url'] ?? '');
          }

          let availabilityLabel = 'Unknown';
          if (typeof availability === 'string') {
            availabilityLabel = availability.includes('InStock') ? 'In Stock' : 'Out of Stock';
          }

          return {
            title: String(data['name'] ?? ''),
            // AggregateOffer nodes carry `lowPrice` instead of `price`.
            price: toNumber(offers['price'] ?? offers['lowPrice']),
            currency: String(offers['priceCurrency'] || 'USD'),
            rating: toNumber(aggregateRating['ratingValue']),
            reviews: Math.trunc(toNumber(aggregateRating['reviewCount'])),
            url: window.location.href,
            imageUrl,
            availability: availabilityLabel,
          };
        }

        // Stage 2: DOM fallback.
        const title = document.querySelector('h1')?.textContent?.trim() || '';
        const priceEl = document.querySelector('[data-price], .price, .product-price');
        const priceText = priceEl?.textContent || '0';
        const imageEl = document.querySelector<HTMLImageElement>(
          '.product-image img, [data-image]'
        );

        return {
          title,
          // Strip currency symbols and separators; unparseable text yields 0.
          price: toNumber(priceText.replace(/[^0-9.]/g, '')),
          currency: 'USD',
          rating: 0,
          reviews: 0,
          url: window.location.href,
          imageUrl: imageEl?.src || '',
          availability: 'Unknown',
        };
      });
    } finally {
      // Best-effort disconnect; destroying the session below is the authoritative cleanup.
      await browser.close().catch(() => undefined);
    }
  } finally {
    await client.sessions.destroy(session.id);
  }
}
Infinite Scroll Handling
/**
 * Collects list items from a page that loads content on scroll and/or via a
 * "Load More" / "Show More" control.
 *
 * Each round extracts the items that appeared since the previous round,
 * scrolls to the bottom, waits a randomized 2–4 s, and clicks a visible
 * load-more control if one exists. The loop ends when `maxItems` have been
 * gathered or when three consecutive rounds produce neither new items nor a
 * taller document.
 *
 * Items are matched with `.item, .card, [data-item]` and are assumed to only
 * ever be appended to the DOM (extraction skips the first N matches already
 * collected).
 *
 * @param url - Page to scrape.
 * @param maxItems - Upper bound on the number of items returned (default 100).
 * @returns At most `maxItems` objects with `title`, `link` and `description`.
 * @throws Propagates session, connection and navigation errors. A failed
 *   load-more click is logged and does not abort the scrape. The session is
 *   always destroyed and the CDP connection closed.
 */
async function scrapeInfiniteScroll(
  url: string,
  maxItems: number = 100
): Promise<any[]> {
  type ListItem = { title?: string; link?: string; description?: string };

  const session = await client.sessions.create({
    stealth: 'max',
    proxy: { type: 'residential', country: 'US' },
    timeout: 600, // 10 min for large scrapes
  });
  const items: ListItem[] = [];

  try {
    const browser = await chromium.connectOverCDP(session.cdpUrl);
    try {
      const page = browser.contexts()[0]?.pages()[0];
      if (!page) {
        throw new Error('Meshbrow session exposed no open page to drive');
      }

      await page.goto(url, { waitUntil: 'networkidle' });
      await page.waitForTimeout(3000);

      let previousHeight = 0;
      let staleCount = 0;

      while (items.length < maxItems && staleCount < 3) {
        // Extract only the elements that were not present in earlier rounds.
        const newItems = await page.evaluate((existingCount) => {
          const elements = document.querySelectorAll('.item, .card, [data-item]');
          return Array.from(elements)
            .slice(existingCount)
            .map((el) => ({
              title: el.querySelector('h2, h3, .title')?.textContent?.trim(),
              link: el.querySelector('a')?.href,
              description: el.querySelector('p, .desc')?.textContent?.trim(),
            }));
        }, items.length);
        items.push(...newItems);

        // A round counts as progress if it yielded items OR the document grew;
        // height alone misses lists that load into a fixed-height container.
        const currentHeight = await page.evaluate(() => document.body.scrollHeight);
        const madeProgress = newItems.length > 0 || currentHeight !== previousHeight;
        staleCount = madeProgress ? 0 : staleCount + 1;
        previousHeight = currentHeight;

        await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
        // Randomized pause so the scroll cadence looks less mechanical.
        await page.waitForTimeout(2000 + Math.random() * 2000);

        // Click "Load More" only when it is actually visible: exhausted lists
        // often keep a hidden button in the DOM, and clicking it would block
        // until the click times out and then throw.
        const loadMore = await page.$('button:has-text("Load More"), a:has-text("Show More")');
        if (loadMore && (await loadMore.isVisible())) {
          try {
            await loadMore.click();
            await page.waitForTimeout(2000);
          } catch (error: unknown) {
            // Keep what has been collected; scrolling alone may still load more.
            console.warn('Load-more click failed; continuing with scrolling only', error);
          }
        }
      }
    } finally {
      // Best-effort disconnect; destroying the session below is the authoritative cleanup.
      await browser.close().catch(() => undefined);
    }
  } finally {
    await client.sessions.destroy(session.id);
  }

  return items.slice(0, maxItems);
}
Anti-Bot Bypass with Session Warming
/**
 * Fetches the rendered HTML of a bot-protected page after "warming up" the
 * session with a short, natural-looking browsing sequence.
 *
 * Steps: open Google, search for the target's hostname, navigate to the
 * target with a Google referer, wait out a Cloudflare interstitial if one is
 * shown, then return `document.body.innerHTML`.
 *
 * @param targetUrl - Absolute URL of the protected page.
 * @returns The inner HTML of the page's `<body>`.
 * @throws Propagates session, connection, navigation and timeout errors
 *   (including the 30 s challenge wait). The session is always destroyed and
 *   the CDP connection closed.
 */
async function scrapeProtectedSite(targetUrl: string): Promise<string> {
  // Fresh stealth session with a pinned desktop fingerprint.
  const session = await client.sessions.create({
    stealth: 'max',
    proxy: { type: 'residential', country: 'US' },
    fingerprint: {
      platform: 'Win32',
      vendor: 'Google Inc.',
    },
  });

  try {
    const browser = await chromium.connectOverCDP(session.cdpUrl);
    try {
      const page = browser.contexts()[0]?.pages()[0];
      if (!page) {
        throw new Error('Meshbrow session exposed no open page to drive');
      }

      // Step 1: Warm up the session with natural browsing.
      await page.goto('https://www.google.com');
      await page.waitForTimeout(2000);

      // Search for the site (builds a plausible browsing history).
      const domain = new URL(targetUrl).hostname;
      await page.fill('input[name="q"]', domain);
      // Arm the navigation wait BEFORE pressing Enter; otherwise a fast
      // navigation can finish first and the wait would hang until timeout.
      await Promise.all([page.waitForNavigation(), page.keyboard.press('Enter')]);
      await page.waitForTimeout(2000);

      // Step 2: Navigate to the target with a search-engine referer.
      await page.goto(targetUrl, {
        referer: 'https://www.google.com',
        waitUntil: 'networkidle',
      });

      // Step 3: Handle a Cloudflare challenge if present.
      const isChallenged = await page.evaluate(() => {
        return (
          document.title.includes('Just a moment') ||
          document.querySelector('#challenge-running') !== null
        );
      });
      if (isChallenged) {
        // Wait for the challenge to auto-solve (Meshbrow handles this). The
        // exit condition mirrors the detection above so that either signal
        // keeps us waiting.
        await page.waitForFunction(
          () =>
            !document.title.includes('Just a moment') &&
            document.querySelector('#challenge-running') === null,
          { timeout: 30000 }
        );
        await page.waitForTimeout(2000);
      }

      // Step 4: Extract content.
      return await page.evaluate(() => document.body.innerHTML);
    } finally {
      // Best-effort disconnect; destroying the session below is the authoritative cleanup.
      await browser.close().catch(() => undefined);
    }
  } finally {
    await client.sessions.destroy(session.id);
  }
}
Concurrent Scraping with Proxy Rotation
/**
 * Scrapes many URLs concurrently, each in its own Meshbrow session with a
 * rotating residential proxy, while capping the number of sessions in flight.
 *
 * Failures are isolated per URL: a URL whose session, navigation or extractor
 * fails is logged and simply absent from the result map.
 */
class ScrapingPool {
  private readonly client: Meshbrow;
  private readonly maxConcurrency: number;

  /**
   * @param apiKey - Meshbrow API key.
   * @param maxConcurrency - Maximum number of simultaneous sessions
   *   (default 5). Values below 1 or non-numeric values are treated as 1.
   */
  constructor(apiKey: string, maxConcurrency: number = 5) {
    this.client = new Meshbrow({ apiKey });
    this.maxConcurrency = Math.max(1, Math.floor(maxConcurrency) || 1);
  }

  /**
   * Runs `extractor` against every URL and collects the successful results.
   *
   * @param urls - Pages to visit. Duplicate URLs share one map entry (last write wins).
   * @param extractor - Called with the loaded page; its resolved value is stored.
   * @returns Map from URL to extracted value, containing only URLs that succeeded.
   */
  async scrapeUrls<T>(
    urls: string[],
    extractor: (page: Page) => Promise<T>
  ): Promise<Map<string, T>> {
    const results = new Map<string, T>();

    const tasks = urls.map((url) => async (): Promise<void> => {
      try {
        results.set(url, await this.scrapeOne(url, extractor));
      } catch (error: unknown) {
        console.error(`Failed: ${url}`, error);
      }
    });

    // Execute with concurrency control
    await this.executeBatch(tasks);
    return results;
  }

  /** Visits one URL in a dedicated session and returns the extractor's result. */
  private async scrapeOne<T>(url: string, extractor: (page: Page) => Promise<T>): Promise<T> {
    const session = await this.client.sessions.create({
      stealth: 'max',
      proxy: { type: 'residential', rotate: true }, // New IP per session
    });

    try {
      const browser = await chromium.connectOverCDP(session.cdpUrl);
      try {
        const page = browser.contexts()[0]?.pages()[0];
        if (!page) {
          throw new Error('Meshbrow session exposed no open page to drive');
        }
        await page.goto(url, { waitUntil: 'networkidle', timeout: 30000 });
        // Randomized dwell time before extraction.
        await page.waitForTimeout(1000 + Math.random() * 2000);
        return await extractor(page);
      } finally {
        // Best-effort disconnect; destroying the session is the authoritative cleanup.
        await browser.close().catch(() => undefined);
      }
    } finally {
      try {
        await this.client.sessions.destroy(session.id);
      } catch (error: unknown) {
        // A cleanup failure must not discard data that was already extracted.
        console.error(`Failed to destroy session for ${url}`, error);
      }
    }
  }

  /**
   * Launches tasks in order with a randomized 0.5–1.5 s stagger, never keeping
   * more than `maxConcurrency` in flight, and resolves once all have settled.
   * A rejecting task is logged and does not affect the others.
   */
  private async executeBatch(tasks: Array<() => Promise<void>>): Promise<void> {
    const inFlight = new Set<Promise<void>>();

    const settle = async (task: () => Promise<void>): Promise<void> => {
      try {
        await task();
      } catch (error: unknown) {
        console.error('Scrape task failed', error);
      }
    };

    for (const [index, task] of tasks.entries()) {
      const running: Promise<void> = settle(task).finally(() => {
        inFlight.delete(running);
      });
      inFlight.add(running);

      if (inFlight.size >= this.maxConcurrency) {
        await Promise.race(inFlight);
      }

      // Brief delay between launches (pointless after the final one).
      if (index < tasks.length - 1) {
        await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
      }
    }

    await Promise.all(inFlight);
  }
}
// Usage: scrape three pages with up to five concurrent sessions and keep, for
// each page, its title, the first 5000 characters of visible text and the
// first 50 link targets.
const pool = new ScrapingPool(process.env.MESHBROW_API_KEY!, 5);

const urls = [
  'https://example.com/page/1',
  'https://example.com/page/2',
  'https://example.com/page/3',
];

const data = await pool.scrapeUrls(urls, async (page) => {
  return page.evaluate(() => {
    const anchors = Array.from(document.querySelectorAll('a[href]'));
    const hrefs = anchors.map((anchor) => (anchor as HTMLAnchorElement).href);

    return {
      title: document.title,
      text: document.body.innerText.slice(0, 5000),
      links: hrefs.slice(0, 50),
    };
  });
});
Structured Data Extraction Patterns
/**
 * Extracts an HTML table as an array of records keyed by column header.
 *
 * Column names come from a single header row: the first `thead tr` if the
 * table has one, otherwise the first body row when it consists solely of
 * `<th>` cells. Columns without a (non-empty) header are named `col_<index>`.
 * Row-header `<th>` cells inside data rows are kept as ordinary cells so that
 * values stay aligned with their columns, and header-only rows are not
 * emitted as records. `colspan`/`rowspan` are not resolved.
 *
 * @param page - Playwright page containing the table.
 * @param tableSelector - CSS selector for the table element.
 * @returns One record per data row; an empty array if nothing matches the selector.
 */
async function extractTable(
  page: Page,
  tableSelector: string
): Promise<Record<string, string>[]> {
  return page.evaluate((sel) => {
    const table = document.querySelector(sel);
    if (!table) return [];

    const cellText = (cell: Element): string => cell.textContent?.trim() || '';

    // Direct <td>/<th> children only, so cells of nested tables are not pulled in.
    const cellsOf = (row: Element): Element[] =>
      Array.from(row.children).filter((cell) => /^t[dh]$/i.test(cell.tagName));

    const isHeaderRow = (row: Element): boolean => {
      const cells = cellsOf(row);
      return cells.length > 0 && cells.every((cell) => /^th$/i.test(cell.tagName));
    };

    const bodyRows = Array.from(table.querySelectorAll('tbody tr'));
    const firstBodyRow = bodyRows[0];
    // HTML parsers move a header row into <tbody> when the markup has no <thead>.
    const headerRow =
      table.querySelector('thead tr') ??
      (firstBodyRow && isHeaderRow(firstBodyRow) ? firstBodyRow : null);
    const headers = headerRow ? cellsOf(headerRow).map(cellText) : [];

    return bodyRows
      .filter((row) => !isHeaderRow(row))
      .map((row) => {
        const record: Record<string, string> = {};
        cellsOf(row).forEach((cell, i) => {
          record[headers[i] || `col_${i}`] = cellText(cell);
        });
        return record;
      });
  }, tableSelector);
}
/**
 * Returns the trimmed text of every `<li>` found inside the elements matched
 * by `listSelector`.
 *
 * The selector may be a comma-separated list (e.g. `"ul.a, ol.b"`): each
 * matched element is searched for `<li>` descendants individually, instead of
 * appending ` li` to the selector text (which would only apply to the last
 * alternative). Items reachable through several matched ancestors are
 * reported once. A blank selector matches every `<li>` in the document.
 *
 * @param page - Playwright page containing the list(s).
 * @param listSelector - CSS selector (or selector list) for the list container(s).
 * @returns Text of each list item; an empty string for items without text.
 */
async function extractList(page: Page, listSelector: string): Promise<string[]> {
  return page.evaluate((sel) => {
    const roots: ParentNode[] =
      sel.trim() === '' ? [document] : Array.from(document.querySelectorAll(sel));

    // A Set de-duplicates items of nested matched lists while keeping the order they were found in.
    const items = new Set<Element>();
    for (const root of roots) {
      for (const li of Array.from(root.querySelectorAll('li'))) {
        items.add(li);
      }
    }

    return Array.from(items, (li) => li.textContent?.trim() || '');
  }, listSelector);
}
/**
 * Reads basic meta/SEO information from the current page: document title,
 * meta description, canonical URL, Open Graph image, the first `<h1>` and the
 * text of every `<h2>`.
 *
 * Attribute-based fields are `undefined` when the element is missing and
 * `null` when the element exists without the attribute.
 *
 * @param page - Playwright page to inspect.
 */
async function extractMeta(page: Page) {
  return page.evaluate(() => {
    const readAttribute = (selector: string, attribute: string) =>
      document.querySelector(selector)?.getAttribute(attribute);
    const trimmedText = (node: Element | null) => node?.textContent?.trim();

    const secondaryHeadings = Array.from(document.querySelectorAll('h2'), (heading) =>
      trimmedText(heading)
    );

    return {
      title: document.title,
      description: readAttribute('meta[name="description"]', 'content'),
      canonical: readAttribute('link[rel="canonical"]', 'href'),
      ogImage: readAttribute('meta[property="og:image"]', 'content'),
      h1: trimmedText(document.querySelector('h1')),
      h2s: secondaryHeadings,
    };
  });
}
Key Takeaways
- Residential proxies + rotation — New IP per session avoids IP blocks
- Session warming — Browse naturally before targeting protected pages
- Structured data first — JSON-LD gives you clean data without DOM parsing
- Concurrency control — 3–5 parallel sessions balance speed and stealth
- Handle infinite scroll — Track scroll height to detect end of content
- Cloudflare auto-solve — Meshbrow’s stealth handles most challenges automatically
- Timeout generously — Protected sites may take 30s+ to clear challenges