AI · Node.js · Automation

PDF Summarizer with OpenAI and Node.js

TT
TopicTrick Team
PDF Summarizer with OpenAI and Node.js

PDF Summarizer with OpenAI and Node.js

Reading a 40-page report shouldn't take 40 minutes. This tool extracts text from any text-based PDF and generates a structured summary — key points, main conclusions, and action items — in seconds using OpenAI's GPT-4o mini model.

By the end of this module you will have a working REST API that accepts a PDF upload and returns a clean, structured JSON summary ready to use in any application.

This is Tool 1 of the Build 50 AI Automation Tools course.


What You'll Build

A Node.js Express API with one endpoint:

  • POST /summarize — accepts a PDF file upload, returns a structured summary
  • Handles PDFs of any length via text chunking
  • Returns JSON with summary, keyPoints, and actionItems

Project Setup

bash
mkdir pdf-summarizer && cd pdf-summarizer
npm init -y

npm install express multer pdf-parse openai dotenv

Create the project structure:

text
pdf-summarizer/
├── src/
│   ├── server.js
│   ├── routes/summarize.js
│   └── services/pdfService.js
├── uploads/          # optional — the API keeps uploads in memory and never writes here
├── .env
└── package.json
json
// package.json
{
  "type": "module",
  "scripts": {
    "start": "node src/server.js",
    "dev": "node --watch src/server.js"
  }
}

Environment Variables

bash
# .env
OPENAI_API_KEY=sk-your-openai-api-key-here
PORT=3000

Get your API key from platform.openai.com/api-keys.


The PDF Service

This service handles text extraction and OpenAI communication:

js
// src/services/pdfService.js
import pdfParse from 'pdf-parse';
import OpenAI from 'openai';

// OpenAI client shared by the functions in this module; the API key is read
// from the OPENAI_API_KEY environment variable.
const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });

// Max words per chunk — GPT-4o Mini handles ~6000 comfortably.
// Used as the default `maxWords` argument of chunkText().
const CHUNK_SIZE_WORDS = 6000;

/**
 * Run pdf-parse over a PDF buffer and hand back its extracted text.
 *
 * @param {Buffer} buffer - Raw bytes of the PDF file.
 * @returns {Promise<string>} The `text` field of the pdf-parse result.
 */
export async function extractTextFromPDF(buffer) {
  const { text } = await pdfParse(buffer);
  return text;
}

/**
 * Split text into chunks of at most `maxWords` whitespace-separated words.
 *
 * Words inside each chunk are re-joined with a single space. Leading and
 * trailing whitespace never produces empty "words", so chunks do not start
 * with a stray space and no trailing empty chunk is emitted.
 *
 * @param {string} text - Text to split.
 * @param {number} [maxWords=CHUNK_SIZE_WORDS] - Maximum words per chunk; must
 *   be a positive integer.
 * @returns {string[]} Chunks in document order; empty when `text` contains no
 *   words.
 * @throws {RangeError} If `maxWords` is not a positive integer (a step of 0 or
 *   less would otherwise never terminate the loop).
 */
function chunkText(text, maxWords = CHUNK_SIZE_WORDS) {
  if (!Number.isInteger(maxWords) || maxWords < 1) {
    throw new RangeError(`maxWords must be a positive integer, got ${maxWords}`);
  }

  // trim() + filter(Boolean) drops the empty tokens that split() yields for
  // leading/trailing whitespace and for an empty string.
  const words = text.trim().split(/\s+/).filter(Boolean);
  const chunks = [];

  for (let start = 0; start < words.length; start += maxWords) {
    chunks.push(words.slice(start, start + maxWords).join(' '));
  }

  return chunks;
}

/**
 * Ask the model for a structured summary of one piece of text.
 *
 * @param {string} text - The chunk to summarize (sent as the user message).
 * @param {number} chunkIndex - Zero-based position of this chunk.
 * @param {number} totalChunks - Number of chunks in the whole document; when
 *   greater than 1 the prompt tells the model it is seeing one part of many.
 * @returns {Promise<object>} The parsed JSON object from the model's reply.
 */
async function summarizeChunk(text, chunkIndex, totalChunks) {
  let instruction = 'Summarize the following document.';
  if (totalChunks > 1) {
    instruction = `This is part ${chunkIndex + 1} of ${totalChunks} from a larger document. Summarize this section.`;
  }

  // Assembled line by line; joined with newlines this is the full system prompt.
  const systemPrompt = [
    `You are an expert document analyst. ${instruction}`,
    'Return a JSON object with exactly these fields:',
    '{',
    '  "summary": "2-3 sentence overview",',
    '  "keyPoints": ["point 1", "point 2", "point 3"],',
    '  "actionItems": ["action 1", "action 2"]',
    '}',
    'Only return valid JSON, no markdown fences.',
  ].join('\n');

  const completion = await openai.chat.completions.create({
    model: 'gpt-4o-mini',
    messages: [
      { role: 'system', content: systemPrompt },
      { role: 'user', content: text },
    ],
    temperature: 0.3,
    response_format: { type: 'json_object' },
  });

  const { message } = completion.choices[0];
  return JSON.parse(message.content);
}

/**
 * Synthesize multiple chunk summaries into a final summary.
 *
 * Each chunk's summary, key points AND action items are forwarded to the
 * model. The prompt asks for "all action items identified across all
 * sections", so the per-section action items must be part of the input;
 * otherwise the model has nothing to aggregate them from.
 *
 * @param {Array<{summary?: string, keyPoints?: string[], actionItems?: string[]}>} chunkSummaries
 *   Parsed per-chunk results, in document order. Missing or non-array fields
 *   are rendered as empty rather than throwing.
 * @returns {Promise<object>} The parsed JSON object from the model's reply.
 */
async function synthesizeSummaries(chunkSummaries) {
  // The model is asked for these fields but JSON mode does not enforce them,
  // so tolerate a missing or mistyped list instead of crashing on .join().
  const asList = (value) => (Array.isArray(value) ? value.join(', ') : '');

  const combined = chunkSummaries
    .map((s, i) =>
      [
        `Section ${i + 1}:`,
        `Summary: ${s?.summary ?? ''}`,
        `Key Points: ${asList(s?.keyPoints)}`,
        `Action Items: ${asList(s?.actionItems)}`,
      ].join('\n')
    )
    .join('\n\n');

  const response = await openai.chat.completions.create({
    model: 'gpt-4o-mini',
    messages: [
      {
        role: 'system',
        content: `You are an expert document analyst. Combine these section summaries into a single coherent document summary.
Return a JSON object with exactly these fields:
{
  "summary": "3-4 sentence executive summary of the entire document",
  "keyPoints": ["top 5 key points from the whole document"],
  "actionItems": ["all action items identified across all sections"]
}
Only return valid JSON, no markdown fences.`,
      },
      {
        role: 'user',
        content: combined,
      },
    ],
    temperature: 0.3,
    response_format: { type: 'json_object' },
  });

  return JSON.parse(response.choices[0].message.content);
}

/**
 * Entry point: turn a PDF buffer into a structured summary object.
 *
 * A document that fits in one chunk is summarized with a single model call.
 * Longer documents are summarized chunk by chunk (all requests started
 * together) and the partial results are then merged by a final call.
 *
 * @param {Buffer} buffer - Raw bytes of the PDF file.
 * @returns {Promise<object>} The parsed summary object returned by the model.
 * @throws {Error} If no text could be extracted from the PDF.
 */
export async function summarizePDF(buffer) {
  const text = await extractTextFromPDF(buffer);

  const isBlank = !text || text.trim() === '';
  if (isBlank) {
    throw new Error('No extractable text found in PDF. The file may be scanned or image-based.');
  }

  const chunks = chunkText(text);
  const totalChunks = chunks.length;

  // Short document: one request is enough, no merge step.
  if (totalChunks === 1) {
    const singleResult = await summarizeChunk(chunks[0], 0, 1);
    return singleResult;
  }

  // Long document: summarize every chunk, then combine the partial summaries.
  const pending = chunks.map((chunk, index) => summarizeChunk(chunk, index, totalChunks));
  const chunkSummaries = await Promise.all(pending);

  const merged = await synthesizeSummaries(chunkSummaries);
  return merged;
}

File Upload Route

js
// src/routes/summarize.js
import { Router } from 'express';
import multer from 'multer';
import { summarizePDF } from '../services/pdfService.js';

const router = Router();

// Uploads are buffered in memory (req.file.buffer) — nothing is written to disk.
// Note: every in-flight upload can hold up to the fileSize limit in RAM.
const upload = multer({
  storage: multer.memoryStorage(),
  limits: { fileSize: 50 * 1024 * 1024 }, // 50MB max
  fileFilter: (_req, file, cb) => {
    if (file.mimetype === 'application/pdf') {
      cb(null, true);
      return;
    }

    // Mark the rejection as a client error. The app-level error handler
    // responds with `err.status ?? 500`, so without a status a wrong file
    // type would be reported as an internal server error.
    const rejection = new Error('Only PDF files are accepted');
    rejection.status = 415; // Unsupported Media Type
    cb(rejection, false);
  },
});

// POST / — expects a multipart upload in the "pdf" field and responds with the
// summary fields merged into a small metadata envelope. Any failure is passed
// to the Express error pipeline via next().
router.post('/', upload.single('pdf'), async (req, res, next) => {
  try {
    const { file } = req;

    if (!file) {
      res.status(400).json({ error: 'No PDF file provided. Use field name "pdf".' });
      return;
    }

    const summary = await summarizePDF(file.buffer);
    const sizeInKb = (file.size / 1024).toFixed(1);

    res.json({
      success: true,
      filename: file.originalname,
      size: `${sizeInKb} KB`,
      ...summary,
    });
  } catch (err) {
    next(err);
  }
});

export default router;

Express Server

js
// src/server.js
import 'dotenv/config';
import express from 'express';
import summarizeRouter from './routes/summarize.js';

const app = express();
const PORT = process.env.PORT ?? 3000;

app.use(express.json());

// Routes
app.use('/summarize', summarizeRouter);

// Health check
app.get('/health', (_req, res) => {
  res.json({ status: 'ok', timestamp: new Date().toISOString() });
});

// Error handler — must keep the 4-argument signature so Express treats it as
// error-handling middleware.
app.use((err, _req, res, next) => {
  console.error(err.message);

  // If a response has already started, the status/body can no longer be
  // changed; hand over to Express's default handler to close the connection.
  if (res.headersSent) {
    next(err);
    return;
  }

  // Multer's own errors (file too large, unexpected field, ...) carry a `code`
  // but no `status`; they are caused by the request, so answer with 4xx
  // instead of falling through to 500.
  let status = err.status ?? 500;
  if (err.name === 'MulterError') {
    status = err.code === 'LIMIT_FILE_SIZE' ? 413 : 400;
  }

  res.status(status).json({ error: err.message });
});

app.listen(PORT, () => {
  console.log(`PDF Summarizer running on http://localhost:${PORT}`);
});

Testing the API

Start the server:

bash
npm run dev

Send a PDF via curl:

bash
curl -X POST http://localhost:3000/summarize \
  -F "pdf=@/path/to/your/document.pdf"

Example response:

json
{
  "success": true,
  "filename": "quarterly-report.pdf",
  "size": "342.8 KB",
  "summary": "The Q3 2025 report shows 23% revenue growth driven by enterprise subscriptions, with APAC emerging as the fastest-growing region at 41% YoY. Operating margins improved to 18% following the cloud infrastructure optimisation completed in July. The company is on track to exceed its full-year revenue guidance of $280M.",
  "keyPoints": [
    "Revenue grew 23% YoY to $68M in Q3",
    "Enterprise segment now represents 64% of total revenue",
    "APAC grew 41% — largest growth region",
    "Operating margin improved from 14% to 18%",
    "Headcount increased by 120 to 1,847 employees"
  ],
  "actionItems": [
    "Expand APAC sales team by Q1 2026",
    "Complete cloud migration of remaining on-prem workloads",
    "Review enterprise pricing model for 2026"
  ]
}

Adding a CLI Interface

For batch processing, add a simple CLI:

js
// src/cli.js
import 'dotenv/config';
import { readFile } from 'fs/promises';
import { summarizePDF } from './services/pdfService.js';

// First CLI argument after the script name: path of the PDF to summarize.
const [, , filePath] = process.argv;

if (!filePath) {
  console.error('Usage: node src/cli.js <path-to-pdf>');
  process.exit(1);
}

// Print each entry on its own line, prefixed with a 1-based index.
const printNumbered = (entries) => {
  entries.forEach((entry, index) => console.log(`${index + 1}. ${entry}`));
};

try {
  console.log(`Summarizing ${filePath}...`);
  const pdfBytes = await readFile(filePath);
  const { summary, keyPoints, actionItems } = await summarizePDF(pdfBytes);

  console.log('\n─── SUMMARY ─────────────────────────────');
  console.log(summary);

  console.log('\n─── KEY POINTS ──────────────────────────');
  printNumbered(keyPoints);

  console.log('\n─── ACTION ITEMS ────────────────────────');
  printNumbered(actionItems);
} catch (err) {
  // Any read, extraction, or API failure ends the process with a non-zero code.
  console.error('Error:', err.message);
  process.exit(1);
}

Run from the terminal:

bash
node src/cli.js ~/Downloads/report.pdf

Batch Processing Multiple PDFs

js
// src/batch.js
import 'dotenv/config';
import { mkdir, readFile, readdir, writeFile } from 'fs/promises';
import path from 'path';
import { summarizePDF } from './services/pdfService.js';

// Usage: node src/batch.js [pdf-dir] [output-dir]
const PDF_DIR    = process.argv[2] || './pdfs';
const OUTPUT_DIR = process.argv[3] || './summaries';

// Match the extension case-insensitively so "REPORT.PDF" is not skipped.
const isPdf = (name) => path.extname(name).toLowerCase() === '.pdf';

const files = (await readdir(PDF_DIR)).filter(isPdf);
console.log(`Processing ${files.length} PDFs...`);

// writeFile does not create missing directories; without this every file
// would fail with ENOENT when the output directory does not exist yet.
// `recursive: true` also makes this a no-op when it already exists.
await mkdir(OUTPUT_DIR, { recursive: true });

// Files are processed one at a time; a failure is logged and the batch moves on.
for (const file of files) {
  try {
    const buffer  = await readFile(path.join(PDF_DIR, file));
    const result  = await summarizePDF(buffer);
    // path.parse().name strips the extension regardless of its letter case.
    const outFile = path.join(OUTPUT_DIR, `${path.parse(file).name}.json`);

    await writeFile(outFile, JSON.stringify({ file, ...result }, null, 2));
    console.log(`✅ ${file}`);
  } catch (err) {
    console.error(`❌ ${file}: ${err.message}`);
  }
}

console.log('\nBatch complete. Summaries saved to:', OUTPUT_DIR);

Model Selection Guide

| Model | Speed | Cost | Use Case |
| --- | --- | --- | --- |
| gpt-4o-mini | Fast | ~$0.001/doc | Standard documents, batch processing |
| gpt-4o | Medium | ~$0.01/doc | Complex technical docs, legal documents |

For most use cases, gpt-4o-mini is the right choice — it produces excellent summaries at 10x lower cost.


Key Concepts Covered

pdf-parse reads the internal text stream of a PDF. It does not process scanned images — for that, use GPT-4o Vision (covered in Tool 4).

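response_format: { type: 'json_object' } switches the model into JSON mode, so the reply is a syntactically valid JSON object — no markdown fences, no explanation text, just the object. JSON mode does not enforce the field names requested in the prompt, and a reply cut off by the token limit can still be incomplete, so validate the parsed object before relying on it. This is essential for programmatic use.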

Map-reduce summarization splits large documents into chunks, summarizes each independently in parallel (via Promise.all), then synthesizes the partial summaries into a final result. This handles documents of any length without hitting context window limits.

memoryStorage in Multer keeps the uploaded PDF in memory as a Buffer — faster than writing to disk and simpler to clean up.


Build 50 AI Automation Tools — Tool 1 of 50

Your PDF summarizer is live. Continue to Tool 2 to build an AI resume parser that extracts structured candidate data.


    Summary

    You now have a production-ready PDF summarizer that:

    • Accepts PDF uploads via a REST API or CLI
    • Extracts text with pdf-parse and rejects PDFs that contain no extractable text
    • Splits large documents into chunks and applies map-reduce summarization
    • Uses response_format: json_object so the model replies with a parseable JSON object instead of free text
    • Returns summary, keyPoints, and actionItems — ready to use in any downstream app
    • Processes batches of PDFs from a directory with a single script

    Continue to Tool 2: Resume Parser & Skill Extractor →