PDF Summarizer with OpenAI and Node.js

PDF Summarizer with OpenAI and Node.js
Reading a 40-page report shouldn't take 40 minutes. This tool extracts text from any text-based PDF and generates a structured summary — key points, main conclusions, and action items — in seconds using GPT-4o mini.
By the end of this module you will have a working REST API that accepts a PDF upload and returns a clean, structured JSON summary ready to use in any application.
This is Tool 1 of the Build 50 AI Automation Tools course.
What You'll Build
A Node.js Express API with one endpoint:
- POST /summarize — accepts a PDF file upload, returns a structured summary
- Handles PDFs of any length via text chunking
- Returns JSON with summary, keyPoints, and actionItems
Project Setup
mkdir pdf-summarizer && cd pdf-summarizer
npm init -y
npm install express multer pdf-parse openai dotenv
Create the project structure:
pdf-summarizer/
├── src/
│ ├── server.js
│ ├── routes/summarize.js
│ └── services/pdfService.js
├── uploads/ # optional scratch directory — the route below keeps uploads in memory and writes nothing here
├── .env
└── package.json
// package.json
{
"type": "module",
"scripts": {
"start": "node src/server.js",
"dev": "node --watch src/server.js"
}
}
Environment Variables
# .env
OPENAI_API_KEY=sk-your-openai-api-key-here
PORT=3000
Get your API key from platform.openai.com/api-keys.
The PDF Service
This service handles text extraction and OpenAI communication:
// src/services/pdfService.js
import pdfParse from 'pdf-parse';
import OpenAI from 'openai';
// Shared OpenAI client. The API key is read from the OPENAI_API_KEY environment
// variable when this module is first evaluated.
const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });
// Max words per chunk — GPT-4o Mini handles ~6000 comfortably.
// Used as the default chunk size by chunkText().
const CHUNK_SIZE_WORDS = 6000;
/**
 * Pull the plain-text content out of a PDF held in memory.
 *
 * @param {Buffer} buffer - Raw bytes of the PDF file.
 * @returns {Promise<string>} The `text` field of the pdf-parse result.
 */
export async function extractTextFromPDF(buffer) {
  const { text } = await pdfParse(buffer);
  return text;
}
/**
 * Split text into chunks of at most `maxWords` whitespace-separated words.
 *
 * Surrounding whitespace is trimmed before splitting so that no empty "word"
 * is produced at either end; words inside a chunk are re-joined with single
 * spaces.
 *
 * @param {string} text - Text to split.
 * @param {number} [maxWords=CHUNK_SIZE_WORDS] - Maximum words per chunk; must
 *   be a positive integer.
 * @returns {string[]} Chunks in document order; empty when `text` contains no
 *   words.
 * @throws {RangeError} If `maxWords` is not a positive integer.
 */
function chunkText(text, maxWords = CHUNK_SIZE_WORDS) {
  // A step of zero or less would make the loop below never terminate.
  if (!Number.isInteger(maxWords) || maxWords < 1) {
    throw new RangeError(`maxWords must be a positive integer, got ${maxWords}`);
  }

  const trimmed = text.trim();
  if (trimmed === '') {
    return [];
  }

  const words = trimmed.split(/\s+/);
  const chunks = [];
  for (let i = 0; i < words.length; i += maxWords) {
    chunks.push(words.slice(i, i + maxWords).join(' '));
  }
  return chunks;
}
/**
 * Summarize a single chunk of text with gpt-4o-mini in JSON mode.
 *
 * @param {string} text - The chunk to summarize; sent as the user message.
 * @param {number} chunkIndex - Zero-based position of this chunk.
 * @param {number} totalChunks - Number of chunks the document was split into.
 * @returns {Promise<object>} The JSON object parsed from the model's reply.
 *   The prompt asks for `summary`, `keyPoints` and `actionItems`; the shape is
 *   not validated here.
 * @throws {Error} If the completion carries no message content, or the
 *   content is not valid JSON (the SyntaxError is attached as `cause`).
 */
async function summarizeChunk(text, chunkIndex, totalChunks) {
  const context = totalChunks > 1
    ? `This is part ${chunkIndex + 1} of ${totalChunks} from a larger document. Summarize this section.`
    : 'Summarize the following document.';

  const systemPrompt = [
    `You are an expert document analyst. ${context}`,
    'Return a JSON object with exactly these fields:',
    '{',
    '"summary": "2-3 sentence overview",',
    '"keyPoints": ["point 1", "point 2", "point 3"],',
    '"actionItems": ["action 1", "action 2"]',
    '}',
    'Only return valid JSON, no markdown fences.',
  ].join('\n');

  const response = await openai.chat.completions.create({
    model: 'gpt-4o-mini',
    messages: [
      { role: 'system', content: systemPrompt },
      { role: 'user', content: text },
    ],
    temperature: 0.3,
    response_format: { type: 'json_object' },
  });

  const choice = response.choices?.[0];
  const content = choice?.message?.content;
  const label = `chunk ${chunkIndex + 1} of ${totalChunks}`;
  const finishReason = choice?.finish_reason ?? 'unknown';

  // JSON.parse(null) evaluates to null, so a missing content would otherwise
  // be handed back to the caller as if it were a successful result.
  if (typeof content !== 'string' || content.trim() === '') {
    throw new Error(`OpenAI returned no content for ${label} (finish_reason: ${finishReason})`);
  }

  try {
    return JSON.parse(content);
  } catch (err) {
    // Surface which chunk failed and why generation stopped; a reply cut off
    // by the token limit is the usual source of unparseable JSON.
    throw new Error(
      `OpenAI returned invalid JSON for ${label} (finish_reason: ${finishReason})`,
      { cause: err },
    );
  }
}
/**
 * Synthesize multiple chunk summaries into a final summary.
 *
 * Each chunk's summary, key points and action items are rendered into a
 * "Section N" text block, and gpt-4o-mini (JSON mode) is asked to merge the
 * blocks into one document-level result.
 *
 * @param {Array<{summary?: string, keyPoints?: string[], actionItems?: string[]}>} chunkSummaries
 *   Per-chunk results, in document order.
 * @returns {Promise<object>} The JSON object parsed from the model's reply.
 *   The prompt asks for `summary`, `keyPoints` and `actionItems`; the shape is
 *   not validated here.
 * @throws {Error} If the completion carries no message content, or the
 *   content is not valid JSON (the SyntaxError is attached as `cause`).
 */
async function synthesizeSummaries(chunkSummaries) {
  // Chunk results are unvalidated model output, so a list may be missing or
  // not an array; render those as "none" instead of throwing.
  const asList = (value) => (
    Array.isArray(value) && value.length > 0 ? value.join(', ') : 'none'
  );

  // Action items must be part of the input: the prompt below asks the model
  // to collect them across sections, which it cannot do if they are left out.
  const combined = chunkSummaries
    .map((s, i) => [
      `Section ${i + 1}:`,
      `Summary: ${s?.summary ?? ''}`,
      `Key Points: ${asList(s?.keyPoints)}`,
      `Action Items: ${asList(s?.actionItems)}`,
    ].join('\n'))
    .join('\n\n');

  const systemPrompt = [
    'You are an expert document analyst. Combine these section summaries into a single coherent document summary.',
    'Return a JSON object with exactly these fields:',
    '{',
    '"summary": "3-4 sentence executive summary of the entire document",',
    '"keyPoints": ["top 5 key points from the whole document"],',
    '"actionItems": ["all action items identified across all sections"]',
    '}',
    'Only return valid JSON, no markdown fences.',
  ].join('\n');

  const response = await openai.chat.completions.create({
    model: 'gpt-4o-mini',
    messages: [
      { role: 'system', content: systemPrompt },
      { role: 'user', content: combined },
    ],
    temperature: 0.3,
    response_format: { type: 'json_object' },
  });

  const choice = response.choices?.[0];
  const content = choice?.message?.content;
  const finishReason = choice?.finish_reason ?? 'unknown';

  // JSON.parse(null) evaluates to null, so a missing content would otherwise
  // be handed back to the caller as if it were a successful result.
  if (typeof content !== 'string' || content.trim() === '') {
    throw new Error(`OpenAI returned no content for the final summary (finish_reason: ${finishReason})`);
  }

  try {
    return JSON.parse(content);
  } catch (err) {
    throw new Error(
      `OpenAI returned invalid JSON for the final summary (finish_reason: ${finishReason})`,
      { cause: err },
    );
  }
}
/**
* Main function: extract PDF text and return a structured summary.
*
* A document that fits in a single chunk is summarized with one call to
* summarizeChunk(). A longer document is summarized chunk by chunk, with all
* chunk requests started together via Promise.all, and the per-chunk results
* are then merged by synthesizeSummaries().
*
* @param {Buffer} buffer - PDF file contents.
* @returns {Promise<object>} The object produced by summarizeChunk() (single
* chunk) or synthesizeSummaries() (multiple chunks).
* @throws {Error} If the extracted text is empty or whitespace-only. Errors
* from the helpers propagate unchanged; Promise.all rejects as soon as any
* chunk request fails.
*/
export async function summarizePDF(buffer) {
const text = await extractTextFromPDF(buffer);
// Whitespace-only text is treated the same as no text at all.
if (!text || text.trim().length === 0) {
throw new Error('No extractable text found in PDF. The file may be scanned or image-based.');
}
const chunks = chunkText(text);
if (chunks.length === 1) {
// Short document — single API call
return await summarizeChunk(chunks[0], 0, 1);
}
// Long document — map-reduce pattern
// NOTE(review): every chunk request is issued at once with no concurrency
// cap — confirm this is acceptable for very large PDFs and API rate limits.
const chunkSummaries = await Promise.all(
chunks.map((chunk, i) => summarizeChunk(chunk, i, chunks.length))
);
return await synthesizeSummaries(chunkSummaries);
}
File Upload Route
// src/routes/summarize.js
import { Router } from 'express';
import multer from 'multer';
import { summarizePDF } from '../services/pdfService.js';
const router = Router();
// Store uploads in memory (buffer) — no disk writes needed.
// Files larger than 50 MB are rejected, and only parts whose declared MIME
// type is application/pdf are accepted.
const upload = multer({
  storage: multer.memoryStorage(),
  limits: { fileSize: 50 * 1024 * 1024 }, // 50MB max
  fileFilter: (_req, file, cb) => {
    if (file.mimetype === 'application/pdf') {
      cb(null, true);
      return;
    }
    // Mark the rejection as a client error (415 Unsupported Media Type).
    // Without a `status` property an error handler that falls back to 500
    // would report this as a server fault.
    const rejection = Object.assign(new Error('Only PDF files are accepted'), { status: 415 });
    cb(rejection, false);
  },
});
/**
 * POST / — summarize the PDF uploaded in the multipart field "pdf".
 *
 * Responds with 400 when no file was attached. On success the response merges
 * upload metadata (filename, human-readable size) with the fields returned by
 * summarizePDF(). Any thrown error is forwarded to the Express error handler.
 */
router.post('/', upload.single('pdf'), async (req, res, next) => {
  try {
    const { file } = req;
    if (!file) {
      return res.status(400).json({ error: 'No PDF file provided. Use field name "pdf".' });
    }

    const result = await summarizePDF(file.buffer);
    const payload = {
      success: true,
      filename: file.originalname,
      size: `${(file.size / 1024).toFixed(1)} KB`,
      ...result,
    };
    res.json(payload);
  } catch (err) {
    next(err);
  }
});
export default router;
Express Server
// src/server.js
import 'dotenv/config';
import express from 'express';
import summarizeRouter from './routes/summarize.js';
const app = express();
const PORT = process.env.PORT ?? 3000;

app.use(express.json());

// Routes
app.use('/summarize', summarizeRouter);

// Health check
app.get('/health', (_req, res) => {
  res.json({ status: 'ok', timestamp: new Date().toISOString() });
});

// Error handler. Must keep all four parameters: Express recognises
// error-handling middleware by its arity.
app.use((err, _req, res, next) => {
  // Log the stack when available so failures can be traced, not just named.
  console.error(err.stack ?? err.message);

  // If a response has already started, it cannot be replaced; hand the error
  // to Express's default handler, which closes the connection.
  if (res.headersSent) {
    return next(err);
  }

  let status = err.status ?? 500;
  // Upload errors raised by Multer carry a `code` but no `status`; they are
  // caused by the request, so report them as 4xx rather than 500.
  if (err.name === 'MulterError') {
    status = err.code === 'LIMIT_FILE_SIZE' ? 413 : 400;
  }

  res.status(status).json({ error: err.message });
});
app.listen(PORT, () => {
console.log(`PDF Summarizer running on http://localhost:${PORT}`);
});
Testing the API
Start the server:
npm run dev
Send a PDF via curl:
curl -X POST http://localhost:3000/summarize \
-F "pdf=@/path/to/your/document.pdf"
Example response:
{
"success": true,
"filename": "quarterly-report.pdf",
"size": "342.8 KB",
"summary": "The Q3 2025 report shows 23% revenue growth driven by enterprise subscriptions, with APAC emerging as the fastest-growing region at 41% YoY. Operating margins improved to 18% following the cloud infrastructure optimisation completed in July. The company is on track to exceed its full-year revenue guidance of $280M.",
"keyPoints": [
"Revenue grew 23% YoY to $68M in Q3",
"Enterprise segment now represents 64% of total revenue",
"APAC grew 41% — largest growth region",
"Operating margin improved from 14% to 18%",
"Headcount increased by 120 to 1,847 employees"
],
"actionItems": [
"Expand APAC sales team by Q1 2026",
"Complete cloud migration of remaining on-prem workloads",
"Review enterprise pricing model for 2026"
]
}
Adding a CLI Interface
For batch processing, add a simple CLI:
// src/cli.js
import 'dotenv/config';
import { readFile } from 'fs/promises';
import { summarizePDF } from './services/pdfService.js';
// Path of the PDF to summarize: the first argument after the script name.
const filePath = process.argv[2];
if (!filePath) {
console.error('Usage: node src/cli.js <path-to-pdf>');
process.exit(1);
}
// Read the file, summarize it, and print the three result sections.
// Any failure (unreadable file, summarization error, missing result fields)
// lands in the catch below and exits with status 1.
try {
console.log(`Summarizing ${filePath}...`);
const buffer = await readFile(filePath);
const result = await summarizePDF(buffer);
console.log('\n─── SUMMARY ─────────────────────────────');
console.log(result.summary);
console.log('\n─── KEY POINTS ──────────────────────────');
// Lists are printed 1-based.
result.keyPoints.forEach((p, i) => console.log(`${i + 1}. ${p}`));
console.log('\n─── ACTION ITEMS ────────────────────────');
result.actionItems.forEach((a, i) => console.log(`${i + 1}. ${a}`));
} catch (err) {
console.error('Error:', err.message);
process.exit(1);
}
Run from the terminal:
node src/cli.js ~/Downloads/report.pdf
Batch Processing Multiple PDFs
// src/batch.js
import 'dotenv/config';
import { mkdir, readFile, writeFile, readdir } from 'fs/promises';
import path from 'path';
import { summarizePDF } from './services/pdfService.js';
// Input and output directories come from the first two CLI arguments,
// falling back to ./pdfs and ./summaries.
const PDF_DIR = process.argv[2] || './pdfs';
const OUTPUT_DIR = process.argv[3] || './summaries';

// Match the extension case-insensitively so files such as REPORT.PDF are not
// silently skipped.
const isPdf = (name) => path.extname(name).toLowerCase() === '.pdf';
const files = (await readdir(PDF_DIR)).filter(isPdf);
console.log(`Processing ${files.length} PDFs...`);

// writeFile does not create missing parent directories, so make sure the
// output directory exists before the first summary is written.
await mkdir(OUTPUT_DIR, { recursive: true });

// Files are processed one at a time; a failure is logged and the loop moves
// on to the next file.
for (const file of files) {
  try {
    const buffer = await readFile(path.join(PDF_DIR, file));
    const result = await summarizePDF(buffer);
    const baseName = path.basename(file, path.extname(file));
    const outFile = path.join(OUTPUT_DIR, `${baseName}.json`);
    await writeFile(outFile, JSON.stringify({ file, ...result }, null, 2));
    console.log(`✅ ${file}`);
  } catch (err) {
    console.error(`❌ ${file}: ${err.message}`);
  }
}
console.log('\nBatch complete. Summaries saved to:', OUTPUT_DIR);
Model Selection Guide
| Model | Speed | Cost | Use Case |
|---|---|---|---|
| gpt-4o-mini | Fast | ~$0.001/doc | Standard documents, batch processing |
| gpt-4o | Medium | ~$0.01/doc | Complex technical docs, legal documents |
For most use cases, gpt-4o-mini is the right choice — it produces excellent summaries at 10x lower cost.
Key Concepts Covered
pdf-parse reads the internal text stream of a PDF. It does not process scanned images — for that, use GPT-4o Vision (covered in Tool 4).
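response_format: { type: 'json_object' } switches the model into JSON mode, so the reply is syntactically valid JSON — no markdown fences, no explanation text, just the object. JSON mode does not enforce the field names (the prompt does that), and a reply cut off by the token limit can still be incomplete, so keep error handling around JSON.parse. This is essential for programmatic use.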
Map-reduce summarization splits large documents into chunks, summarizes each independently in parallel (via Promise.all), then synthesizes the partial summaries into a final result. This handles documents of any length without hitting context window limits.
memoryStorage in Multer keeps the uploaded PDF in memory as a Buffer — faster than writing to disk and simpler to clean up.
Build 50 AI Automation Tools — Tool 1 of 50
Your PDF summarizer is live. Continue to Tool 2 to build an AI resume parser that extracts structured candidate data.
Summary
You now have a production-ready PDF summarizer that:
- Accepts PDF uploads via a REST API or CLI
- Extracts text with pdf-parse and rejects PDFs that contain no extractable text
- Splits large documents into chunks and applies map-reduce summarization
- Uses response_format: json_object so the model replies with a JSON object instead of free-form text
- Returns summary, keyPoints, and actionItems — ready to use in any downstream app
- Processes batches of PDFs from a directory with a single script
Continue to Tool 2: Resume Parser & Skill Extractor →
