AI · Node.js · Automation

AI Web Scraper with Smart Content Parsing

Build an intelligent web scraper using Node.js, Puppeteer, and GPT-4o that extracts structured data from any webpage without writing custom CSS selectors — just describe what you want.

TT
Emily Ross
5 min read
AI Web Scraper with Smart Content Parsing

AI Web Scraper with Smart Content Parsing

Traditional web scraping breaks every time a website is redesigned. This tool uses Puppeteer to render any page in a real browser, then sends the page's rendered text to GPT-4o to extract whatever structured data you need — no CSS selectors, no maintenance.

This is Tool 7 of the Build 50 AI Automation Tools course.


What You'll Build

  • POST /scrape — provide a URL and describe the data you want, receive structured JSON
  • Works on any website including JavaScript-heavy SPAs
  • Handles dynamically rendered content, with an optional helper for paginated listings

Setup

bash
mkdir ai-scraper && cd ai-scraper
npm init -y
npm install express puppeteer openai dotenv
# Puppeteer downloads Chromium automatically (~170MB)
bash
# .env
OPENAI_API_KEY=sk-your-key-here
PORT=3000

Scraper Service

js
// src/services/scraperService.js
import puppeteer from 'puppeteer';
import OpenAI from 'openai';

// OpenAI client used by the functions in this module; the API key is read from
// the OPENAI_API_KEY environment variable when the module is first evaluated.
const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });

/**
 * Render a page in headless Chromium and return its visible text.
 *
 * Only absolute http(s) URLs are accepted. Anything else (file:, data:,
 * javascript:, relative paths, non-strings) is rejected before a browser is
 * launched, so a caller-supplied URL cannot be used to load local files.
 * NOTE: this does not block private/internal network hosts.
 *
 * @param {string} url - Absolute http(s) URL of the page to load.
 * @returns {Promise<string>} At most 40,000 characters of the body's
 *   `innerText`, or of the document's HTML when that is empty or missing.
 * @throws {Error} If `url` is not a valid http(s) URL, or if launching the
 *   browser or navigating to the page fails.
 */
async function fetchPageHTML(url) {
  let target;
  try {
    target = new URL(url);
  } catch (cause) {
    throw new Error(`Invalid URL: ${String(url)}`, { cause });
  }
  if (target.protocol !== 'http:' && target.protocol !== 'https:') {
    throw new Error(
      `Unsupported URL protocol "${target.protocol}": only http and https URLs can be scraped`,
    );
  }

  const browser = await puppeteer.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
  });

  try {
    const page = await browser.newPage();
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36');
    await page.setViewport({ width: 1280, height: 800 });
    await page.goto(target.href, { waitUntil: 'networkidle2', timeout: 30_000 });

    // Best-effort wait for <body>; a timeout here is deliberately ignored so
    // extraction still runs against whatever has rendered.
    await page.waitForSelector('body', { timeout: 5000 }).catch(() => {});

    // Strip non-content elements, then read the rendered text. The removal
    // matters mostly for the innerHTML fallback, which would otherwise carry
    // scripts, styles and SVG markup into the prompt.
    const text = await page.evaluate(() => {
      document.querySelectorAll('script, style, svg, noscript, iframe').forEach(el => el.remove());
      return document.body?.innerText || document.documentElement.innerHTML;
    });

    // Cap at 40,000 characters to bound prompt size (~10k tokens by the
    // original estimate).
    return text.slice(0, 40_000);
  } finally {
    // Always release the Chromium process, even when navigation throws.
    await browser.close();
  }
}

/**
 * Fetch a page's text and ask the model to extract the requested data as JSON.
 *
 * @param {string} url - Page to scrape; passed to `fetchPageHTML`.
 * @param {string} extractionGoal - Natural-language description of the fields
 *   to extract; interpolated into the system prompt.
 * @returns {Promise<{url: string, extractedAt: string, data: any}>} The source
 *   URL, an ISO-8601 timestamp, and the parsed JSON returned by the model.
 * @throws {Error} If fetching the page or the API call fails, if the model
 *   returns no content, or if its content is not valid JSON (for example when
 *   the reply was cut off).
 */
export async function scrapeAndExtract(url, extractionGoal) {
  const html = await fetchPageHTML(url);

  const response = await openai.chat.completions.create({
    model: 'gpt-4o',
    messages: [
      {
        role: 'system',
        content: `You are a web data extraction expert. Extract structured data from the webpage content below.
The user wants to extract: ${extractionGoal}
Return ONLY a JSON object with the extracted data. Use clear, descriptive field names.
If a requested field is not found, set it to null.`,
      },
      {
        role: 'user',
        content: `URL: ${url}\n\nPage content:\n${html}`,
      },
    ],
    temperature: 0.1,
    response_format: { type: 'json_object' },
  });

  const choice = response.choices?.[0];
  const content = choice?.message?.content;
  const finishReason = choice?.finish_reason ?? 'unknown';

  // JSON.parse(null) quietly yields null, so a missing reply must be caught
  // explicitly instead of being reported as successfully extracted data.
  if (typeof content !== 'string' || content.trim() === '') {
    throw new Error(`Model returned no content (finish_reason: ${finishReason})`);
  }

  let data;
  try {
    data = JSON.parse(content);
  } catch (cause) {
    throw new Error(`Model returned invalid JSON (finish_reason: ${finishReason})`, { cause });
  }

  return {
    url,
    extractedAt: new Date().toISOString(),
    data,
  };
}

/**
 * Scrape several URLs one after another with the same extraction goal.
 *
 * A failure on one URL is recorded in its result entry and does not abort the
 * batch. A randomized 1.5–2.5 s pause is inserted between consecutive
 * requests only — never after the last one — so a single-URL batch returns
 * without an artificial wait.
 *
 * @param {Iterable<string>} urls - URLs to scrape, processed in order.
 * @param {string} extractionGoal - Description of the data to extract.
 * @returns {Promise<Array<object>>} One entry per URL, in input order:
 *   `{ success: true, ...result }` or `{ success: false, url, error }`.
 */
export async function scrapeBatch(urls, extractionGoal) {
  const results = [];
  let isFirst = true;

  for (const url of urls) {
    if (!isFirst) {
      // Polite delay between requests
      await new Promise((resolve) => setTimeout(resolve, 1500 + Math.random() * 1000));
    }
    isFirst = false;

    try {
      const result = await scrapeAndExtract(url, extractionGoal);
      results.push({ success: true, ...result });
    } catch (err) {
      results.push({ success: false, url, error: err.message });
    }
  }

  return results;
}

API Route + Server

js
// src/server.js
import 'dotenv/config';
import express from 'express';
import { scrapeAndExtract, scrapeBatch } from './services/scraperService.js';

// Express application; JSON request bodies are parsed into req.body.
const app = express();
const parseJsonBody = express.json();
app.use(parseJsonBody);

// POST /scrape — scrape one URL.
// Body: { url, extract }. Replies 400 when either field is missing; otherwise
// replies with { success: true, ...result }. Failures go to the error handler.
app.post('/scrape', async (req, res, next) => {
  try {
    const { url: targetUrl, extract: goal } = req.body;

    const isMissingInput = !targetUrl || !goal;
    if (isMissingInput) {
      res.status(400).json({ error: 'url and extract fields required' });
      return;
    }

    const scraped = await scrapeAndExtract(targetUrl, goal);
    res.json({ success: true, ...scraped });
  } catch (err) {
    next(err);
  }
});

// POST /scrape/batch — scrape several URLs with one extraction goal.
// Body: { urls: string[], extract }. Replies 400 unless `urls` is a non-empty
// array and `extract` is present; otherwise replies with
// { success: true, count, results }. Failures go to the error handler.
app.post('/scrape/batch', async (req, res, next) => {
  try {
    const { urls, extract } = req.body;
    // Array.isArray rather than `urls?.length`: a plain string also has a
    // length and would otherwise be iterated character by character.
    if (!Array.isArray(urls) || urls.length === 0 || !extract) {
      return res.status(400).json({ error: 'urls array and extract required' });
    }
    const results = await scrapeBatch(urls, extract);
    res.json({ success: true, count: results.length, results });
  } catch (err) { next(err); }
});

// Liveness probe.
app.get('/health', (_req, res) => {
  res.json({ status: 'ok' });
});

// Error handler: the four-argument signature is what marks this as error
// middleware for Express. Replies 500 with the error's message.
app.use((err, _req, res, _next) => {
  res.status(500).json({ error: err.message });
});

// Listen on PORT from the environment, falling back to 3000 when it is unset.
const listenPort = process.env.PORT ?? 3000;
app.listen(listenPort, () => {
  console.log('AI Scraper running');
});

Testing

bash
# Extract product info from an e-commerce page
curl -X POST http://localhost:3000/scrape \
  -H "Content-Type: application/json" \
  -d '{
    "url": "https://example-store.com/product/123",
    "extract": "product name, price, description, rating, number of reviews, availability status, and all image URLs"
  }'

# Extract all job listings from a jobs page
curl -X POST http://localhost:3000/scrape \
  -H "Content-Type: application/json" \
  -d '{
    "url": "https://careers.example.com/jobs",
    "extract": "array of job listings, each with title, department, location, type (remote/hybrid/onsite), and apply URL"
  }'

Sample response (abridged — the API response also includes "success": true):

json
{
  "url": "https://example-store.com/product/123",
  "extractedAt": "2025-11-15T10:23:45Z",
  "data": {
    "productName": "Sony WH-1000XM5 Headphones",
    "price": 349.99,
    "currency": "USD",
    "rating": 4.7,
    "reviewCount": 2841,
    "availability": "In Stock",
    "description": "Industry-leading noise cancelling headphones with 30-hour battery life",
    "imageUrls": [
      "https://example-store.com/images/wh1000xm5-black-1.jpg",
      "https://example-store.com/images/wh1000xm5-black-2.jpg"
    ]
  }
}

Pagination Support

js
/**
 * Walk a paginated listing (`?page=1`, `?page=2`, …) and collect the items
 * the model extracts from each page.
 *
 * The `page` query parameter is set through the URL API, so a `baseUrl` that
 * already has a query string (e.g. `/list?sort=asc`) yields a valid URL
 * rather than one containing two `?`. Paging stops once the model reports
 * `hasMore !== true` or `maxPages` pages have been fetched.
 *
 * @param {string} baseUrl - Absolute URL of the listing, without a page number.
 * @param {string} extractionGoal - Description of the items to extract.
 * @param {number} [maxPages=5] - Upper bound on the number of pages fetched.
 * @returns {Promise<Array<any>>} Items from all fetched pages, in page order.
 * @throws {TypeError} If `baseUrl` is not a valid absolute URL.
 * @throws {Error} If navigation, the API call, or parsing the model's JSON fails.
 */
export async function scrapeWithPagination(baseUrl, extractionGoal, maxPages = 5) {
  // Parse up front so an invalid base URL fails before a browser is launched.
  const pageUrl = new URL(baseUrl);

  const browser = await puppeteer.launch({ headless: true, args: ['--no-sandbox'] });
  const allData = [];

  try {
    const page = await browser.newPage();
    let currentPage = 1;
    let hasNextPage = true;

    while (hasNextPage && currentPage <= maxPages) {
      pageUrl.searchParams.set('page', String(currentPage));
      await page.goto(pageUrl.href, { waitUntil: 'networkidle2' });
      const html = await page.evaluate(() => document.body.innerText);

      const response = await openai.chat.completions.create({
        model: 'gpt-4o-mini',
        messages: [
          { role: 'system', content: `Extract: ${extractionGoal}. Return JSON with "items" array and "hasMore" boolean.` },
          { role: 'user', content: html.slice(0, 40_000) },
        ],
        response_format: { type: 'json_object' },
      });

      const result = JSON.parse(response.choices[0].message.content);
      // Only merge a real array: spreading a string would add single
      // characters, and spreading a plain object would throw.
      if (Array.isArray(result.items)) {
        allData.push(...result.items);
      }
      hasNextPage = result.hasMore === true;
      currentPage++;

      // Pause only when another page will actually be fetched.
      if (hasNextPage && currentPage <= maxPages) {
        await new Promise((resolve) => setTimeout(resolve, 1000));
      }
    }
  } finally {
    // Always release the Chromium process, even when a page fails.
    await browser.close();
  }

  return allData;
}

Build 50 AI Automation Tools — Tool 7 of 50

AI web scraping is live. Continue to Tool 8 to build a news aggregator that fetches and summarizes daily news.


    Summary

    • Puppeteer renders the full page including JavaScript before extraction
    • Sending innerText instead of raw HTML removes tags, scripts, and styles — reducing tokens by 80%+
    • The natural language extraction goal replaces hundreds of site-specific CSS selectors
    • Polite delays between requests reduce the risk of IP banning
    • Pagination support chains multiple pages into a single extraction run

    Continue to Tool 8: News Aggregator & AI Summarizer →