How is AI web scraping different from traditional Puppeteer scraping?

Traditional scraping requires writing specific CSS selectors for each site (h1.title, div.price, etc.) that break whenever the site redesigns. AI scraping sends the rendered page text to GPT-4o together with a plain-English description of the data to extract. It works on any site without site-specific code and adapts automatically to layout changes.

Does this approach work on JavaScript-heavy single-page apps?

Yes. Puppeteer renders the full page including JavaScript before extracting the HTML, so dynamically loaded content is captured. Unlike simple HTTP requests (fetch/axios), Puppeteer can wait for specific elements, scroll to load more content, and interact with the page before extraction.

How do I avoid getting blocked while scraping?

Use realistic browser headers and User-Agent strings, add random delays between requests (1-3 seconds), rotate residential proxies for high-volume scraping, and use Puppeteer Stealth mode (puppeteer-extra-plugin-stealth) to bypass bot detection. Always respect robots.txt and the site's terms of service.

← Back to Build 50 AI Automation Tools

AI Web Scraper with Smart Content Parsing

Traditional web scraping breaks every time a website redesigns. This tool uses Puppeteer to render any page in a real browser, then sends the rendered page text to GPT-4o to extract whatever structured data you need — no CSS selectors, no maintenance.

This is Tool 7 of the Build 50 AI Automation Tools course.

What You'll Build

POST /scrape — provide a URL and describe the data you want, receive structured JSON
Works on any website including JavaScript-heavy SPAs
Automatically handles pagination and dynamic content

Setup

bash

mkdir ai-scraper && cd ai-scraper
npm init -y
npm install express puppeteer openai dotenv
# Puppeteer downloads Chromium automatically (~170MB)

bash

# .env
OPENAI_API_KEY=sk-your-key-here
PORT=3000

Scraper Service

// src/services/scraperService.js
import puppeteer from 'puppeteer';
import OpenAI from 'openai';

// Shared OpenAI client used by every extraction call in this module; the API
// key is read from the OPENAI_API_KEY environment variable.
const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });

/**
 * Render a page in headless Chromium and return its readable content.
 *
 * Despite the name, the normal result is the page's rendered text
 * (`document.body.innerText`), not markup. Raw `innerHTML` of the document
 * element is returned only when there is no body or its text is empty.
 * Because `innerText` contains rendered text only, attribute values such as
 * link `href` and image `src` are not part of the result.
 *
 * @param {string} url - Absolute http(s) URL of the page to load.
 * @returns {Promise<string>} At most 40,000 characters of page content.
 * @throws {TypeError} If `url` cannot be parsed as an absolute URL.
 * @throws {Error} If the URL scheme is not http/https, or if launching the
 *   browser or navigating to the page fails.
 */
async function fetchPageHTML(url) {
  // The URL originates from the request body. Reject anything that is not web
  // content (file:, chrome:, data:, javascript:, ...) before a browser is
  // started, so local files cannot be read through the scraper.
  // NOTE(review): this does not block http(s) URLs pointing at internal hosts.
  const target = new URL(url);
  if (target.protocol !== 'http:' && target.protocol !== 'https:') {
    throw new Error(`Only http(s) URLs can be scraped, got "${target.protocol}"`);
  }

  const browser = await puppeteer.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
  });

  try {
    const page = await browser.newPage();
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36');
    await page.setViewport({ width: 1280, height: 800 });
    await page.goto(target.href, { waitUntil: 'networkidle2', timeout: 30_000 });

    // Best-effort wait for <body>; a timeout here is deliberately ignored
    // because the extraction below has its own fallback for a missing body.
    await page.waitForSelector('body', { timeout: 5000 }).catch(() => {});

    // Drop non-content elements from the live DOM, then read the rendered text
    // (far fewer tokens than markup).
    const content = await page.evaluate(() => {
      document.querySelectorAll('script, style, svg, noscript, iframe').forEach(el => el.remove());
      return document.body?.innerText || document.documentElement.innerHTML;
    });

    return content.slice(0, 40_000); // Limit to ~10k tokens
  } finally {
    // Always release the browser process, even when navigation fails.
    await browser.close();
  }
}

/**
 * Scrape a single page and have the model extract the requested data from it.
 *
 * @param {string} url - Page to load; passed to fetchPageHTML and echoed back.
 * @param {string} extractionGoal - Plain-language description of the data to
 *   extract; interpolated into the system prompt.
 * @returns {Promise<{url: string, extractedAt: string, data: any}>} The URL,
 *   an ISO-8601 timestamp taken after the model responded, and the parsed JSON
 *   the model returned.
 * @throws Propagates errors from page fetching, the OpenAI call, and
 *   JSON.parse of the model output.
 */
export async function scrapeAndExtract(url, extractionGoal) {
  const pageContent = await fetchPageHTML(url);

  const systemPrompt = [
    'You are a web data extraction expert. Extract structured data from the webpage content below.',
    `The user wants to extract: ${extractionGoal}`,
    'Return ONLY a JSON object with the extracted data. Use clear, descriptive field names.',
    'If a requested field is not found, set it to null.',
  ].join('\n');
  const userPrompt = `URL: ${url}\n\nPage content:\n${pageContent}`;

  const completion = await openai.chat.completions.create({
    model: 'gpt-4o',
    messages: [
      { role: 'system', content: systemPrompt },
      { role: 'user', content: userPrompt },
    ],
    temperature: 0.1,
    // Ask the API for a JSON object so the content can be parsed directly.
    response_format: { type: 'json_object' },
  });

  const extractedAt = new Date().toISOString();
  const data = JSON.parse(completion.choices[0].message.content);

  return { url, extractedAt, data };
}

/**
 * Scrape several URLs one after another with the same extraction goal.
 *
 * A failure on one URL is recorded in the result list and does not stop the
 * batch. A randomized 1.5–2.5 s pause is inserted between consecutive
 * requests only, so no time is wasted after the final URL.
 *
 * @param {Iterable<string>} urls - URLs to scrape, processed in order.
 * @param {string} extractionGoal - Forwarded unchanged to scrapeAndExtract.
 * @returns {Promise<Array<object>>} One entry per URL: either
 *   `{ success: true, ...result }` or `{ success: false, url, error }`.
 */
export async function scrapeBatch(urls, extractionGoal) {
  const results = [];
  let isFirstRequest = true;

  for (const url of urls) {
    if (!isFirstRequest) {
      // Polite delay between requests
      await new Promise(r => setTimeout(r, 1500 + Math.random() * 1000));
    }
    isFirstRequest = false;

    try {
      const result = await scrapeAndExtract(url, extractionGoal);
      results.push({ success: true, ...result });
    } catch (err) {
      results.push({ success: false, url, error: err.message });
    }
  }

  return results;
}

API Route + Server

// src/server.js
import 'dotenv/config';
import express from 'express';
import { scrapeAndExtract, scrapeBatch } from './services/scraperService.js';

// Express application; express.json() parses JSON request bodies into req.body.
const app = express();
app.use(express.json());

// Single URL scrape.
// Body: { url, extract } — responds with { success: true, url, extractedAt, data },
// 400 when either field is missing; other failures go to the error middleware.
app.post('/scrape', async (req, res, next) => {
  try {
    // req.body is undefined in Express 5 when no JSON body was parsed; default
    // to {} so a missing body yields the 400 below instead of a TypeError/500.
    const { url, extract } = req.body ?? {};
    if (!url || !extract) {
      return res.status(400).json({ error: 'url and extract fields required' });
    }

    const result = await scrapeAndExtract(url, extract);
    return res.json({ success: true, ...result });
  } catch (err) {
    return next(err);
  }
});

// Batch scrape.
// Body: { urls: string[], extract } — responds with { success, count, results },
// 400 when urls is not a non-empty array or extract is missing.
app.post('/scrape/batch', async (req, res, next) => {
  try {
    // req.body is undefined in Express 5 when no JSON body was parsed; default
    // to {} so a missing body yields the 400 below instead of a TypeError/500.
    const { urls, extract } = req.body ?? {};

    // Require a real array: a plain string also has a truthy .length and would
    // otherwise be scraped one character at a time.
    if (!Array.isArray(urls) || urls.length === 0 || !extract) {
      return res.status(400).json({ error: 'urls array and extract required' });
    }

    const results = await scrapeBatch(urls, extract);
    return res.json({ success: true, count: results.length, results });
  } catch (err) {
    return next(err);
  }
});

// Liveness probe.
app.get('/health', (_req, res) => {
  res.json({ status: 'ok' });
});

// Fallback error handler: reports the error message with a 500 status. All
// four parameters are declared so Express registers it as error middleware.
app.use((err, _req, res, _next) => {
  res.status(500).json({ error: err.message });
});

// Listen on PORT from the environment, or 3000 when it is not set.
const port = process.env.PORT ?? 3000;
app.listen(port, () => {
  console.log('AI Scraper running');
});

Testing

bash

# Extract product info from an e-commerce page
curl -X POST http://localhost:3000/scrape \
  -H "Content-Type: application/json" \
  -d '{
    "url": "https://example-store.com/product/123",
    "extract": "product name, price, description, rating, number of reviews, availability status, and all image URLs"
  }'

# Extract all job listings from a jobs page
curl -X POST http://localhost:3000/scrape \
  -H "Content-Type: application/json" \
  -d '{
    "url": "https://careers.example.com/jobs",
    "extract": "array of job listings, each with title, department, location, type (remote/hybrid/onsite), and apply URL"
  }'

Sample response:

json

{
  "success": true,
  "url": "https://example-store.com/product/123",
  "extractedAt": "2025-11-15T10:23:45.000Z",
  "data": {
    "productName": "Sony WH-1000XM5 Headphones",
    "price": 349.99,
    "currency": "USD",
    "rating": 4.7,
    "reviewCount": 2841,
    "availability": "In Stock",
    "description": "Industry-leading noise cancelling headphones with 30-hour battery life",
    "imageUrls": [
      "https://example-store.com/images/wh1000xm5-black-1.jpg",
      "https://example-store.com/images/wh1000xm5-black-2.jpg"
    ]
  }
}

Pagination Support

/**
 * Walk a `?page=N` paginated listing and collect the items found on each page.
 *
 * Pages are requested from 1 upward. The walk stops when the model does not
 * report `hasMore: true` or when `maxPages` pages have been fetched. One
 * browser instance and one tab are reused for the whole run.
 *
 * @param {string} baseUrl - Absolute http(s) URL of the listing. Existing
 *   query parameters and any fragment are preserved; `page` is set/overwritten.
 * @param {string} extractionGoal - Plain-language description of the items to
 *   extract; interpolated into the system prompt.
 * @param {number} [maxPages=5] - Upper bound on the number of pages fetched.
 * @returns {Promise<Array<any>>} Items from all visited pages, in page order.
 * @throws {TypeError} If `baseUrl` cannot be parsed as an absolute URL.
 * @throws {Error} If the URL scheme is not http/https; also propagates
 *   browser, OpenAI, and JSON.parse errors.
 */
export async function scrapeWithPagination(baseUrl, extractionGoal, maxPages = 5) {
  // Parse once, before a browser is launched: rejects malformed input and
  // non-web schemes (file:, chrome:, ...) that would expose local resources.
  const pageUrl = new URL(baseUrl);
  if (pageUrl.protocol !== 'http:' && pageUrl.protocol !== 'https:') {
    throw new Error(`Only http(s) URLs can be scraped, got "${pageUrl.protocol}"`);
  }

  const browser = await puppeteer.launch({ headless: true, args: ['--no-sandbox'] });
  const allData = [];

  try {
    const page = await browser.newPage();
    let currentPage = 1;
    let hasNextPage = true;

    while (hasNextPage && currentPage <= maxPages) {
      // searchParams.set keeps existing query parameters and the #fragment
      // valid, unlike appending "?page=N" to the raw string.
      pageUrl.searchParams.set('page', String(currentPage));
      await page.goto(pageUrl.href, { waitUntil: 'networkidle2' });
      const pageText = await page.evaluate(() => document.body.innerText);

      const response = await openai.chat.completions.create({
        model: 'gpt-4o-mini',
        messages: [
          { role: 'system', content: `Extract: ${extractionGoal}. Return JSON with "items" array and "hasMore" boolean.` },
          { role: 'user', content: pageText.slice(0, 40_000) },
        ],
        response_format: { type: 'json_object' },
      });

      // The model output is untrusted: tolerate a null result and only accept
      // a real array for "items".
      const result = JSON.parse(response.choices[0].message.content);
      if (Array.isArray(result?.items)) {
        for (const item of result.items) {
          allData.push(item);
        }
      }
      hasNextPage = result?.hasMore === true;
      currentPage++;

      // Pause only when another page will actually be requested.
      if (hasNextPage && currentPage <= maxPages) {
        await new Promise(r => setTimeout(r, 1000));
      }
    }
  } finally {
    // Always release the browser process, even when a page fails.
    await browser.close();
  }

  return allData;
}

Build 50 AI Automation Tools — Tool 7 of 50

AI web scraping is live. Continue to Tool 8 to build a news aggregator that fetches and summarizes daily news.

Summary

Puppeteer renders the full page including JavaScript before extraction
Sending innerText instead of raw HTML removes tags, scripts, and styles — reducing tokens by 80%+
The natural language extraction goal replaces hundreds of site-specific CSS selectors
Polite delays between requests reduce the risk of IP banning
Pagination support chains multiple pages into a single extraction run

Continue to Tool 8: News Aggregator & AI Summarizer →