AI · Node.js · Automation

AI Web Scraper with Smart Content Parsing

TT
TopicTrick Team
AI Web Scraper with Smart Content Parsing

AI Web Scraper with Smart Content Parsing

Traditional web scraping breaks every time a website redesigns. This tool uses Puppeteer to render any page in a real browser, then sends the page's visible text to GPT-4o to extract whatever structured data you need — no CSS selectors, no maintenance.

This is Tool 7 of the Build 50 AI Automation Tools course.


What You'll Build

  • POST /scrape — provide a URL and describe the data you want, receive structured JSON
  • Works on any website including JavaScript-heavy SPAs
  • Handles dynamic content automatically, with an optional helper for paginated listings (see Pagination Support below)

Setup

bash
mkdir ai-scraper && cd ai-scraper
npm init -y
# The source files use ES module `import` syntax, so mark the package as ESM;
# without this, Node treats the .js files as CommonJS and the imports fail.
npm pkg set type=module
npm install express puppeteer openai dotenv
# Puppeteer downloads Chromium automatically (~170MB)
bash
# .env
OPENAI_API_KEY=sk-your-key-here
PORT=3000

Scraper Service

js
// src/services/scraperService.js
import puppeteer from 'puppeteer';
import OpenAI from 'openai';

// Module-level OpenAI client; the API key is read from the OPENAI_API_KEY
// environment variable when this module is first evaluated.
const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });

/**
 * Render a page in headless Chromium and return its visible text.
 *
 * Despite the name, the normal result is `document.body.innerText` (plain
 * text, not markup); raw HTML is only returned as a fallback when the body
 * has no text at all.
 *
 * @param {string} url - Absolute http(s) URL of the page to load.
 * @returns {Promise<string>} Page text, truncated to 40,000 characters.
 * @throws {TypeError} If `url` is malformed or does not use http/https.
 *   Navigation errors from Puppeteer propagate unchanged.
 */
async function fetchPageHTML(url) {
  // The URL comes from the caller, so reject non-web schemes (file:, chrome:,
  // javascript:, ...) before a browser is started. This is a scheme check
  // only; it does not block requests to internal hosts.
  const target = new URL(url); // throws TypeError on malformed input
  if (target.protocol !== 'http:' && target.protocol !== 'https:') {
    throw new TypeError(`Only http(s) URLs can be scraped, received protocol "${target.protocol}"`);
  }

  const browser = await puppeteer.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
  });

  try {
    const page = await browser.newPage();
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36');
    await page.setViewport({ width: 1280, height: 800 });
    await page.goto(target.href, { waitUntil: 'networkidle2', timeout: 30_000 });

    // Best-effort wait for <body>; a timeout here is deliberately ignored so
    // extraction is still attempted on whatever was rendered.
    await page.waitForSelector('body', { timeout: 5000 }).catch(() => {});

    // Drop non-content elements, then take the visible text to save tokens.
    const text = await page.evaluate(() => {
      document.querySelectorAll('script, style, svg, noscript, iframe').forEach((el) => el.remove());
      return document.body?.innerText || document.documentElement.innerHTML;
    });

    return text.slice(0, 40_000); // Limit to ~10k tokens
  } finally {
    // Always release the Chromium process, even when navigation fails.
    await browser.close();
  }
}

/**
 * Load `url` in a browser and ask GPT-4o to extract the requested data from
 * the page text.
 *
 * @param {string} url - Page to scrape; also echoed back in the result.
 * @param {string} extractionGoal - Natural-language description of the data
 *   to extract; it is interpolated into the system prompt.
 * @returns {Promise<{url: string, extractedAt: string, data: unknown}>}
 *   The URL, an ISO-8601 timestamp, and the JSON value parsed from the
 *   model's reply.
 * @throws {Error} If the model reply is empty or is not valid JSON (for
 *   example when the output was cut off). Page-load and API errors propagate
 *   unchanged.
 */
export async function scrapeAndExtract(url, extractionGoal) {
  const html = await fetchPageHTML(url);

  const response = await openai.chat.completions.create({
    model: 'gpt-4o',
    messages: [
      {
        role: 'system',
        content: `You are a web data extraction expert. Extract structured data from the webpage content below.
The user wants to extract: ${extractionGoal}
Return ONLY a JSON object with the extracted data. Use clear, descriptive field names.
If a requested field is not found, set it to null.`,
      },
      {
        role: 'user',
        content: `URL: ${url}\n\nPage content:\n${html}`,
      },
    ],
    temperature: 0.1,
    response_format: { type: 'json_object' },
  });

  const choice = response.choices?.[0];
  const finishReason = choice?.finish_reason ?? 'unknown';
  const content = choice?.message?.content;

  // A missing reply would otherwise surface as `data: null` (JSON.parse(null))
  // or as an opaque SyntaxError; fail with a message that names the cause.
  if (typeof content !== 'string' || content.length === 0) {
    throw new Error(`Model returned no content (finish_reason: ${finishReason})`);
  }

  let data;
  try {
    data = JSON.parse(content);
  } catch (err) {
    throw new Error(`Model returned invalid JSON (finish_reason: ${finishReason})`, { cause: err });
  }

  return {
    url,
    extractedAt: new Date().toISOString(),
    data,
  };
}

/**
 * Scrape several URLs one after another with the same extraction goal.
 *
 * A failure on one URL is recorded in the results instead of aborting the
 * batch.
 *
 * @param {Iterable<string>} urls - URLs to scrape, processed in order.
 * @param {string} extractionGoal - Passed unchanged to scrapeAndExtract.
 * @returns {Promise<Array<object>>} One entry per URL: the scrape result
 *   spread together with `success: true`, or
 *   `{ success: false, url, error }` with the error message.
 */
export async function scrapeBatch(urls, extractionGoal) {
  const results = [];
  let isFirstRequest = true;

  for (const url of urls) {
    // Polite, jittered delay *between* requests only: nothing before the
    // first URL and nothing after the last one.
    if (!isFirstRequest) {
      await new Promise((resolve) => setTimeout(resolve, 1500 + Math.random() * 1000));
    }
    isFirstRequest = false;

    try {
      const result = await scrapeAndExtract(url, extractionGoal);
      results.push({ success: true, ...result });
    } catch (err) {
      results.push({ success: false, url, error: err.message });
    }
  }

  return results;
}

API Route + Server

js
// src/server.js
import 'dotenv/config';
import express from 'express';
import { scrapeAndExtract, scrapeBatch } from './services/scraperService.js';

const app = express();
app.use(express.json());

/**
 * Report whether `value` is a string containing an absolute http(s) URL.
 *
 * @param {unknown} value - Candidate taken from the request body.
 * @returns {boolean} True only for parseable http: or https: URLs.
 */
function isHttpUrl(value) {
  if (typeof value !== 'string') return false;
  try {
    const { protocol } = new URL(value);
    return protocol === 'http:' || protocol === 'https:';
  } catch {
    return false; // not parseable as an absolute URL
  }
}

// Single URL scrape
app.post('/scrape', async (req, res, next) => {
  try {
    // req.body can be undefined when no JSON body was parsed; treat as empty.
    const { url, extract } = req.body ?? {};
    if (!url || !extract) return res.status(400).json({ error: 'url and extract fields required' });
    if (!isHttpUrl(url) || typeof extract !== 'string') {
      return res.status(400).json({ error: 'url must be an http(s) URL and extract must be a string' });
    }
    const result = await scrapeAndExtract(url, extract);
    res.json({ success: true, ...result });
  } catch (err) { next(err); }
});

// Batch scrape
app.post('/scrape/batch', async (req, res, next) => {
  try {
    const { urls, extract } = req.body ?? {};
    // Array.isArray matters: a plain string also has a truthy `length` and
    // would otherwise be iterated character by character.
    if (!Array.isArray(urls) || urls.length === 0 || !extract) {
      return res.status(400).json({ error: 'urls array and extract required' });
    }
    if (!urls.every(isHttpUrl) || typeof extract !== 'string') {
      return res.status(400).json({ error: 'every url must be an http(s) URL and extract must be a string' });
    }
    const results = await scrapeBatch(urls, extract);
    res.json({ success: true, count: results.length, results });
  } catch (err) { next(err); }
});

app.get('/health', (_req, res) => res.json({ status: 'ok' }));

// Final error handler: log server-side so failures are visible, then reply
// with the error message as JSON.
app.use((err, _req, res, _next) => {
  console.error(err);
  res.status(500).json({ error: err.message });
});

app.listen(process.env.PORT ?? 3000, () => console.log('AI Scraper running'));

Testing

bash
# Both examples assume the server is running locally on port 3000
# (the PORT value from .env, which is also the code's default).

# Extract product info from an e-commerce page
curl -X POST http://localhost:3000/scrape \
  -H "Content-Type: application/json" \
  -d '{
    "url": "https://example-store.com/product/123",
    "extract": "product name, price, description, rating, number of reviews, availability status, and all image URLs"
  }'

# Extract all job listings from a jobs page
# ("extract" is free-form text, so the desired shape can be described too)
curl -X POST http://localhost:3000/scrape \
  -H "Content-Type: application/json" \
  -d '{
    "url": "https://careers.example.com/jobs",
    "extract": "array of job listings, each with title, department, location, type (remote/hybrid/onsite), and apply URL"
  }'

Sample response:

json
{
  "success": true,
  "url": "https://example-store.com/product/123",
  "extractedAt": "2025-11-15T10:23:45.000Z",
  "data": {
    "productName": "Sony WH-1000XM5 Headphones",
    "price": 349.99,
    "currency": "USD",
    "rating": 4.7,
    "reviewCount": 2841,
    "availability": "In Stock",
    "description": "Industry-leading noise cancelling headphones with 30-hour battery life",
    "imageUrls": [
      "https://example-store.com/images/wh1000xm5-black-1.jpg",
      "https://example-store.com/images/wh1000xm5-black-2.jpg"
    ]
  }
}

Pagination Support

js
/**
 * Walk a paginated listing (`?page=1`, `?page=2`, ...) and collect the items
 * the model extracts from each page.
 *
 * Stops when the model reports `hasMore` as anything other than `true`, or
 * after `maxPages` pages, whichever comes first.
 *
 * @param {string} baseUrl - Absolute http(s) URL of the listing. Existing
 *   query parameters are kept; the `page` parameter is set for each request.
 * @param {string} extractionGoal - Natural-language description of the items
 *   to extract; interpolated into the system prompt.
 * @param {number} [maxPages=5] - Upper bound on the number of pages fetched.
 * @returns {Promise<Array<unknown>>} Items from all visited pages, in order.
 * @throws {TypeError} If `baseUrl` is malformed or does not use http/https.
 *   Navigation, API and JSON-parse errors propagate unchanged.
 */
export async function scrapeWithPagination(baseUrl, extractionGoal, maxPages = 5) {
  // Caller-supplied URL: reject non-web schemes (file:, chrome:, ...) before
  // any browser is started. `new URL` throws TypeError on malformed input.
  const pageUrl = new URL(baseUrl);
  if (pageUrl.protocol !== 'http:' && pageUrl.protocol !== 'https:') {
    throw new TypeError(`Only http(s) URLs can be scraped, received protocol "${pageUrl.protocol}"`);
  }

  const allData = [];
  // Nothing to fetch (maxPages below 1 or not a number): skip launching Chromium.
  if (!(maxPages >= 1)) return allData;

  const browser = await puppeteer.launch({ headless: true, args: ['--no-sandbox'] });

  try {
    const page = await browser.newPage();
    let currentPage = 1;
    let hasNextPage = true;

    while (hasNextPage && currentPage <= maxPages) {
      // Setting the parameter (instead of appending "?page=") keeps URLs that
      // already carry a query string or fragment valid.
      pageUrl.searchParams.set('page', String(currentPage));
      await page.goto(pageUrl.href, { waitUntil: 'networkidle2' });
      const html = await page.evaluate(() => document.body.innerText);

      const response = await openai.chat.completions.create({
        model: 'gpt-4o-mini',
        messages: [
          { role: 'system', content: `Extract: ${extractionGoal}. Return JSON with "items" array and "hasMore" boolean.` },
          { role: 'user', content: html.slice(0, 40_000) },
        ],
        response_format: { type: 'json_object' },
      });

      const result = JSON.parse(response.choices[0].message.content);
      // The model is asked for an array but may return something else;
      // only spread real arrays.
      if (Array.isArray(result?.items)) {
        allData.push(...result.items);
      }
      hasNextPage = result?.hasMore === true;
      currentPage++;

      // Pause only when another page is actually going to be requested.
      if (hasNextPage && currentPage <= maxPages) {
        await new Promise((resolve) => setTimeout(resolve, 1000));
      }
    }
  } finally {
    // Always release the Chromium process, even when a page fails.
    await browser.close();
  }

  return allData;
}

Build 50 AI Automation Tools — Tool 7 of 50

AI web scraping is live. Continue to Tool 8 to build a news aggregator that fetches and summarizes daily news.


    Summary

    • Puppeteer renders the full page including JavaScript before extraction
    • Sending innerText instead of raw HTML removes tags, scripts, and styles — reducing tokens by 80%+
    • The natural language extraction goal replaces hundreds of site-specific CSS selectors
    • Polite delays between requests reduce the risk of IP banning
    • Pagination support chains multiple pages into a single extraction run

    Continue to Tool 8: News Aggregator & AI Summarizer →