AI · Node.js · Automation
AI Web Scraper with Smart Content Parsing
TT
TopicTrick Team
AI Web Scraper with Smart Content Parsing
Traditional web scraping breaks every time a website redesigns. This tool uses Puppeteer to render any page in a real browser, then sends the HTML to GPT-4o to extract whatever structured data you need — no CSS selectors, no maintenance.
This is Tool 7 of the Build 50 AI Automation Tools course.
What You'll Build
- POST /scrape — provide a URL and describe the data you want, receive structured JSON
- Works on any website including JavaScript-heavy SPAs
- Automatically handles pagination and dynamic content
Setup
bash
mkdir ai-scraper && cd ai-scraper
npm init -y
npm install express puppeteer openai dotenv
# Puppeteer downloads Chromium automatically (~170MB)
bash
# .env
OPENAI_API_KEY=sk-your-key-here
PORT=3000
Scraper Service
js
// src/services/scraperService.js
import puppeteer from 'puppeteer';
import OpenAI from 'openai';
const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });
/**
 * Render a page in headless Chromium and return its text content.
 *
 * Despite the name, the normal result is `document.body.innerText` (visible
 * text, no markup). Only when that is missing or empty does the function fall
 * back to the document's raw `innerHTML`. In both cases script, style, svg,
 * noscript and iframe elements are removed first.
 *
 * @param {string} url - Absolute http(s) URL of the page to render.
 * @returns {Promise<string>} Page content, truncated to 40,000 characters.
 * @throws {Error} If `url` is not a valid absolute http(s) URL. Errors from
 *   launching the browser or navigating to the page propagate unchanged.
 */
async function fetchPageHTML(url) {
  // Validate before launching a browser: the URL comes from the caller, and
  // Chromium would otherwise happily open file:, javascript: or data: targets.
  let target;
  try {
    target = new URL(url);
  } catch (err) {
    throw new Error(`Invalid URL: ${url}`, { cause: err });
  }
  if (target.protocol !== 'http:' && target.protocol !== 'https:') {
    throw new Error(
      `Unsupported URL protocol "${target.protocol}": only http and https URLs can be scraped`,
    );
  }
  // NOTE(review): http(s) URLs pointing at internal hosts (localhost, cloud
  // metadata addresses) are still allowed here — confirm whether the deployment
  // needs an additional host allow/deny list.

  // NOTE(review): '--no-sandbox' turns off Chromium's sandbox while rendering
  // caller-supplied pages — confirm this is acceptable for the deployment.
  const browser = await puppeteer.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
  });

  try {
    const page = await browser.newPage();
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36');
    await page.setViewport({ width: 1280, height: 800 });
    await page.goto(target.href, { waitUntil: 'networkidle2', timeout: 30_000 });

    // Best effort: a missing <body> within 5 s is not fatal, the evaluate
    // step below falls back to the document element.
    await page.waitForSelector('body', { timeout: 5000 }).catch(() => {});

    // Strip non-content elements, then take the visible text to save tokens.
    const content = await page.evaluate(() => {
      document.querySelectorAll('script, style, svg, noscript, iframe').forEach(el => el.remove());
      return document.body?.innerText || document.documentElement.innerHTML;
    });

    return content.slice(0, 40_000); // Limit to ~10k tokens
  } finally {
    // Always release the browser process, even when navigation fails.
    await browser.close();
  }
}
/**
 * Fetch a page's content and ask the chat model to extract the requested
 * data from it as a JSON object.
 *
 * @param {string} url - Page to scrape; passed to `fetchPageHTML`.
 * @param {string} extractionGoal - Natural-language description of the data
 *   to extract; interpolated verbatim into the system prompt.
 * @returns {Promise<{url: string, extractedAt: string, data: *}>} The source
 *   URL, an ISO-8601 timestamp taken after extraction, and the parsed JSON
 *   returned by the model.
 * @throws {Error} If the model reply has no content or is not valid JSON
 *   (the original parse error is attached as `cause`). Errors from fetching
 *   the page or from the API call propagate unchanged.
 */
export async function scrapeAndExtract(url, extractionGoal) {
  const html = await fetchPageHTML(url);

  const response = await openai.chat.completions.create({
    model: 'gpt-4o',
    messages: [
      {
        role: 'system',
        content: `You are a web data extraction expert. Extract structured data from the webpage content below.
The user wants to extract: ${extractionGoal}
Return ONLY a JSON object with the extracted data. Use clear, descriptive field names.
If a requested field is not found, set it to null.`,
      },
      {
        role: 'user',
        content: `URL: ${url}\n\nPage content:\n${html}`,
      },
    ],
    temperature: 0.1,
    response_format: { type: 'json_object' },
  });

  const choice = response.choices?.[0];
  const finishReason = choice?.finish_reason ?? 'unknown';
  const content = choice?.message?.content;

  // A null/empty reply would otherwise be parsed into `data: null` and
  // reported to the caller as a successful extraction.
  if (!content) {
    throw new Error(`Model returned no content (finish_reason: ${finishReason})`);
  }

  let data;
  try {
    data = JSON.parse(content);
  } catch (err) {
    // Surface the finish reason alongside the parse failure so a cut-off
    // reply can be told apart from other malformed output.
    throw new Error(`Model returned invalid JSON (finish_reason: ${finishReason})`, { cause: err });
  }

  return {
    url,
    extractedAt: new Date().toISOString(),
    data,
  };
}
export async function scrapeBatch(urls, extractionGoal) {
const results = [];
for (const url of urls) {
try {
const result = await scrapeAndExtract(url, extractionGoal);
results.push({ success: true, ...result });
} catch (err) {
results.push({ success: false, url, error: err.message });
}
// Polite delay between requests
await new Promise(r => setTimeout(r, 1500 + Math.random() * 1000));
}
return results;
}API Route + Server
js
// src/server.js
import 'dotenv/config';
import express from 'express';
import { scrapeAndExtract, scrapeBatch } from './services/scraperService.js';
const app = express();
app.use(express.json());

/** True when `value` is a string containing at least one non-whitespace character. */
const isNonEmptyString = (value) => typeof value === 'string' && value.trim() !== '';

// Single URL scrape
// Body: { url: string, extract: string }. Responds 400 when either is missing.
app.post('/scrape', async (req, res, next) => {
  try {
    // req.body may be undefined when no JSON body was parsed; default to {}
    // so the request is answered with a 400 instead of a destructuring crash.
    const { url, extract } = req.body ?? {};
    if (!isNonEmptyString(url) || !isNonEmptyString(extract)) {
      return res.status(400).json({ error: 'url and extract fields required' });
    }
    const result = await scrapeAndExtract(url, extract);
    res.json({ success: true, ...result });
  } catch (err) { next(err); }
});

// Batch scrape
// Body: { urls: string[], extract: string }. Responds 400 on invalid input.
app.post('/scrape/batch', async (req, res, next) => {
  try {
    const { urls, extract } = req.body ?? {};
    // Require a real, non-empty array of strings: a bare string also has a
    // `.length` and would otherwise be scraped one character at a time.
    const hasValidUrls = Array.isArray(urls) && urls.length > 0 && urls.every(isNonEmptyString);
    if (!hasValidUrls || !isNonEmptyString(extract)) {
      return res.status(400).json({ error: 'urls array and extract required' });
    }
    const results = await scrapeBatch(urls, extract);
    res.json({ success: true, count: results.length, results });
  } catch (err) { next(err); }
});

app.get('/health', (_req, res) => res.json({ status: 'ok' }));

// Fallback error handler: reports any error forwarded via next(err) as a 500.
app.use((err, _req, res, _next) => res.status(500).json({ error: err.message }));

app.listen(process.env.PORT ?? 3000, () => console.log('AI Scraper running'));
Testing
bash
# Extract product info from an e-commerce page
curl -X POST http://localhost:3000/scrape \
-H "Content-Type: application/json" \
-d '{
"url": "https://example-store.com/product/123",
"extract": "product name, price, description, rating, number of reviews, availability status, and all image URLs"
}'
# Extract all job listings from a jobs page
curl -X POST http://localhost:3000/scrape \
-H "Content-Type: application/json" \
-d '{
"url": "https://careers.example.com/jobs",
"extract": "array of job listings, each with title, department, location, type (remote/hybrid/onsite), and apply URL"
}'
Sample response:
json
{
"url": "https://example-store.com/product/123",
"extractedAt": "2025-11-15T10:23:45Z",
"data": {
"productName": "Sony WH-1000XM5 Headphones",
"price": 349.99,
"currency": "USD",
"rating": 4.7,
"reviewCount": 2841,
"availability": "In Stock",
"description": "Industry-leading noise cancelling headphones with 30-hour battery life",
"imageUrls": [
"https://example-store.com/images/wh1000xm5-black-1.jpg",
"https://example-store.com/images/wh1000xm5-black-2.jpg"
]
}
}
Pagination Support
js
export async function scrapeWithPagination(baseUrl, extractionGoal, maxPages = 5) {
const browser = await puppeteer.launch({ headless: true, args: ['--no-sandbox'] });
const allData = [];
try {
const page = await browser.newPage();
let currentPage = 1;
let hasNextPage = true;
while (hasNextPage && currentPage <= maxPages) {
const url = `${baseUrl}?page=${currentPage}`;
await page.goto(url, { waitUntil: 'networkidle2' });
const html = await page.evaluate(() => document.body.innerText);
const response = await openai.chat.completions.create({
model: 'gpt-4o-mini',
messages: [
{ role: 'system', content: `Extract: ${extractionGoal}. Return JSON with "items" array and "hasMore" boolean.` },
{ role: 'user', content: html.slice(0, 40_000) },
],
response_format: { type: 'json_object' },
});
const result = JSON.parse(response.choices[0].message.content);
allData.push(...(result.items || []));
hasNextPage = result.hasMore === true;
currentPage++;
await new Promise(r => setTimeout(r, 1000));
}
} finally {
await browser.close();
}
return allData;
}Build 50 AI Automation Tools — Tool 7 of 50
AI web scraping is live. Continue to Tool 8 to build a news aggregator that fetches and summarizes daily news.
Summary
- Puppeteer renders the full page including JavaScript before extraction
- Sending innerText instead of raw HTML removes tags, scripts, and styles — reducing tokens by 80%+
- The natural language extraction goal replaces hundreds of site-specific CSS selectors
- Polite delays between requests reduce the risk of IP banning
- Pagination support chains multiple pages into a single extraction run
Continue to Tool 8: News Aggregator & AI Summarizer →
Post Navigation (Previous/Next)
