AI Agent Web Research
Enable AI agents (Claude, GPT, LangChain) to browse the web reliably through stealth browsers, avoiding blocks and CAPTCHAs.
The Challenge
AI agents need web access for research, but:
- Direct HTTP requests get blocked by anti-bot systems
- APIs have rate limits and don’t render JavaScript
- CAPTCHAs break automated flows
- Agent frameworks need structured tool interfaces
MCP Server Setup
// ~/.config/claude/mcp.json
{
"mcpServers": {
"meshbrow": {
"command": "npx",
"args": ["@meshbrow/mcp-server"],
"env": {
"MESHBROW_API_KEY": "mb_live_..."
}
}
}
}
LangChain Integration
import { Meshbrow } from '@meshbrow/sdk';
import { chromium } from 'playwright';
import { Tool } from 'langchain/tools';
/**
 * LangChain tool that opens a URL in a remote Meshbrow browser session and
 * returns the page's visible text.
 *
 * The Meshbrow session and the CDP connection are created lazily on the first
 * call and reused by later calls. Call `cleanup()` once the agent is finished
 * to close the connection and destroy the session.
 */
class MeshbrowBrowseTool extends Tool {
  name = 'browse_web';
  description = 'Browse a webpage and extract its text content. Use for research, fact-checking, and data gathering.';

  /** Maximum number of characters of page text returned to the agent. */
  private static readonly MAX_CONTENT_CHARS = 8000;

  private client: Meshbrow;
  private sessionId: string | null = null;
  private cdpUrl: string | null = null;
  // Cached CDP connection; reconnecting on every call would leak connections.
  private browser: Awaited<ReturnType<typeof chromium.connectOverCDP>> | null = null;

  constructor() {
    super();
    this.client = new Meshbrow({ apiKey: process.env.MESHBROW_API_KEY! });
  }

  /**
   * Navigates to `url` and extracts its main text content.
   *
   * @param url - Absolute URL of the page to load.
   * @returns `URL: <url>` followed by the extracted text, or an
   *   `Error browsing <url>: <message>` string if any step fails. Failures are
   *   returned rather than thrown so the calling agent can read them.
   */
  async _call(url: string): Promise<string> {
    try {
      // Session setup lives inside the try so that connection failures are
      // reported through the same error string as navigation failures.
      const page = await this.getPage();

      await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 });
      await page.waitForTimeout(2000); // Let dynamic content load

      // This callback runs inside the browser page, so it cannot close over
      // local variables; the length limit is passed in as an argument.
      const content = await page.evaluate((maxChars) => {
        // Strip page chrome (scripts, styles, navigation, banners) before
        // reading text so that only the body copy remains.
        document
          .querySelectorAll(
            'script, style, nav, footer, header, aside, [role="banner"], [role="navigation"]'
          )
          .forEach((el) => el.remove());

        // Prefer the most specific content container, falling back to <body>.
        const main =
          document.querySelector<HTMLElement>('main') ??
          document.querySelector<HTMLElement>('article') ??
          document.querySelector<HTMLElement>('[role="main"]') ??
          document.body;

        return main?.innerText?.slice(0, maxChars) || 'No content found';
      }, MeshbrowBrowseTool.MAX_CONTENT_CHARS);

      return `URL: ${url}\n\n${content}`;
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      return `Error browsing ${url}: ${message}`;
    }
  }

  /**
   * Closes the CDP connection (if any) and destroys the Meshbrow session.
   * Safe to call more than once; later calls are no-ops.
   */
  async cleanup(): Promise<void> {
    const browser = this.browser;
    const sessionId = this.sessionId;

    // Reset state first so a failure below cannot leave stale handles behind.
    this.browser = null;
    this.sessionId = null;
    this.cdpUrl = null;

    try {
      if (browser) {
        await browser.close();
      }
    } finally {
      // Destroy the session even if closing the connection failed.
      if (sessionId) {
        await this.client.sessions.destroy(sessionId);
      }
    }
  }

  /** Returns a page in the shared session, creating the session and connection on demand. */
  private async getPage() {
    let cdpUrl = this.cdpUrl;
    if (!this.sessionId || !cdpUrl) {
      const session = await this.client.sessions.create({
        proxy: { type: 'residential', country: 'US' },
        stealth: 'max',
        timeout: 600,
      });
      this.sessionId = session.id;
      this.cdpUrl = session.cdpUrl;
      cdpUrl = session.cdpUrl as string;
    }

    let browser = this.browser;
    if (!browser || !browser.isConnected()) {
      browser = await chromium.connectOverCDP(cdpUrl);
      this.browser = browser;
    }

    const context = browser.contexts()[0];
    if (!context) {
      throw new Error('Remote browser exposed no browser context');
    }
    // The context may have no open tab; open one rather than
    // dereferencing `undefined`.
    return context.pages()[0] ?? (await context.newPage());
  }
}
// Usage with LangChain agent
import { ChatOpenAI } from 'langchain/chat_models/openai';
import { AgentExecutor, createOpenAIFunctionsAgent } from 'langchain/agents';
// Keep a reference to the tool so its browser session can be released after the run.
const browseTool = new MeshbrowBrowseTool();
const tools = [browseTool];
const model = new ChatOpenAI({ modelName: 'gpt-4' });
// NOTE(review): `prompt` is not defined in this snippet — define the agent's
// chat prompt before this call.
const agent = await createOpenAIFunctionsAgent({ llm: model, tools, prompt });
const executor = new AgentExecutor({ agent, tools });
// `finally` runs whether the agent succeeds or fails, so the session the tool
// opened is always destroyed instead of lingering until its timeout.
const result = await executor
  .invoke({
    input: 'Research the latest pricing for AWS Lambda and compare it to Google Cloud Functions',
  })
  .finally(() => browseTool.cleanup());
Multi-Page Research Agent
import { Meshbrow } from '@meshbrow/sdk';
import { chromium, Page } from 'playwright';
/** Everything captured from a single visited page. */
interface ResearchResult {
  /** Address that was requested for this page. */
  url: string;
  /** Document title reported by the browser. */
  title: string;
  /** Text of the page's main content area, truncated by the researcher. */
  content: string;
  /** Outbound links found on the page, each formatted as `text: href`. */
  links: Array<string>;
}
/**
 * Drives one long-lived Meshbrow browser session to search the web and
 * collect text and links from multiple pages.
 *
 * Lifecycle: call `init()` once, then `research()` / `searchAndResearch()` as
 * often as needed, then `destroy()` to release the connection and session.
 */
class WebResearcher {
  private client: Meshbrow;
  // Stored so `destroy()` can close the CDP connection as well as the session.
  private browser: Awaited<ReturnType<typeof chromium.connectOverCDP>> | null = null;
  private page: Page | null = null;
  private sessionId: string | null = null;

  /** @param apiKey - Meshbrow API key used to create sessions. */
  constructor(apiKey: string) {
    this.client = new Meshbrow({ apiKey });
  }

  /**
   * Creates the browser session and attaches to a page in it.
   * Calling it again while already initialized is a no-op, so a repeated call
   * cannot leak a second session.
   *
   * @throws If the session cannot be created or connected to; any partially
   *   created session is destroyed before the error is rethrown.
   */
  async init(): Promise<void> {
    if (this.page) return;

    const session = await this.client.sessions.create({
      proxy: { type: 'residential', country: 'US' },
      stealth: 'max',
      timeout: 1800, // 30 min for research sessions
    });
    this.sessionId = session.id;

    try {
      const browser = await chromium.connectOverCDP(session.cdpUrl);
      this.browser = browser;

      const context = browser.contexts()[0];
      if (!context) {
        throw new Error('Remote browser exposed no browser context');
      }
      // The context may have no open tab; open one in that case.
      this.page = context.pages()[0] ?? (await context.newPage());
    } catch (error) {
      await this.destroy();
      throw error;
    }
  }

  /**
   * Loads `url` and captures its title, main text (first 5000 characters) and
   * up to 20 absolute links.
   *
   * @param url - Absolute URL of the page to visit.
   * @throws If `init()` has not been called, or if navigation fails.
   */
  async research(url: string): Promise<ResearchResult> {
    const page = this.page;
    if (!page) throw new Error('Not initialized');

    await page.goto(url, { waitUntil: 'domcontentloaded' });
    await page.waitForTimeout(1500); // Let dynamic content load

    const title = await page.title();

    const content = await page.evaluate(() => {
      const main =
        document.querySelector<HTMLElement>('main, article, [role="main"]') ?? document.body;
      return main.innerText.slice(0, 5000);
    });

    const links = await page.evaluate(() => {
      return Array.from(document.querySelectorAll<HTMLAnchorElement>('a[href]'))
        .map((a) => ({ text: a.textContent?.trim(), href: a.href }))
        // Drop anchors without visible text and non-http(s) targets (mailto:, javascript:, ...).
        .filter((l) => l.text && l.href.startsWith('http'))
        .slice(0, 20)
        .map((l) => `${l.text}: ${l.href}`);
    });

    return { url, title, content, links };
  }

  /**
   * Runs a DuckDuckGo search for `query` and researches the first five result
   * links in order. Pages that fail to load are logged and skipped.
   *
   * @param query - Free-text search query.
   * @returns Results for the pages that loaded successfully; may be empty.
   * @throws If `init()` has not been called, or if the search page itself fails to load.
   */
  async searchAndResearch(query: string): Promise<ResearchResult[]> {
    const page = this.page;
    if (!page) throw new Error('Not initialized');

    // Search via DuckDuckGo (doesn't require JS)
    const searchUrl = `https://html.duckduckgo.com/html/?q=${encodeURIComponent(query)}`;
    await page.goto(searchUrl);

    const resultUrls = await page.evaluate(() => {
      return Array.from(document.querySelectorAll<HTMLAnchorElement>('.result__a'))
        .slice(0, 5)
        .map((a) => a.href);
    });

    const results: ResearchResult[] = [];
    for (const [index, url] of resultUrls.entries()) {
      try {
        results.push(await this.research(url));
      } catch (error) {
        // Best effort: one bad page must not abort the whole run, but record
        // why it was skipped instead of discarding the error.
        const message = error instanceof Error ? error.message : String(error);
        console.warn(`Skipping ${url}: ${message}`);
        continue;
      }

      // Random 1-3 s pause between page visits; pointless after the last one.
      if (index < resultUrls.length - 1) {
        await page.waitForTimeout(1000 + Math.random() * 2000);
      }
    }
    return results;
  }

  /**
   * Closes the CDP connection and destroys the Meshbrow session.
   * Safe to call more than once and before `init()`.
   */
  async destroy(): Promise<void> {
    const browser = this.browser;
    const sessionId = this.sessionId;

    // Reset state first so the instance is never left holding dead handles.
    this.browser = null;
    this.page = null;
    this.sessionId = null;

    try {
      if (browser) {
        await browser.close();
      }
    } finally {
      // Destroy the session even if closing the connection failed.
      if (sessionId) {
        await this.client.sessions.destroy(sessionId);
      }
    }
  }
}
// Usage
const researcher = new WebResearcher(process.env.MESHBROW_API_KEY!);
try {
  await researcher.init();
  const results = await researcher.searchAndResearch('best practices GraphQL API design 2026');
  for (const r of results) {
    console.log(`\n## ${r.title}\n${r.content.slice(0, 500)}...`);
  }
} finally {
  // Always release the remote session, even if init or the search throws.
  await researcher.destroy();
}
Key Takeaways
- Reuse sessions — Create one browser and navigate across pages instead of spinning up new sessions per URL
- Extract text, not HTML — Agents work better with clean text than raw markup
- Set generous timeouts — Research sessions may take 10-30 minutes
- Handle errors gracefully — Some pages will fail; skip and continue