AI · Node.js · Automation

Document Comparison Tool with AI

Build an AI document comparison tool using Node.js and GPT-4o that identifies differences, additions, and removals between two document versions and explains what changed in plain English.

TT
Emily Ross
5 min read
Document Comparison Tool with AI

Document Comparison Tool with AI

Reviewing the differences between two contract versions, policy documents, or terms of service is time-consuming and error-prone. This tool compares any two text documents and returns a plain-English analysis of what changed, what was added, and what was removed — with a significance assessment for each change.

This is Tool 5 of the Build 50 AI Automation Tools course.


What You'll Build

  • POST /compare — upload two documents (PDF or plain text) as multipart form fields docA and docB, receive structured diff analysis
  • Identifies additions, removals, and modifications with semantic explanations
  • Assigns significance scores so you focus on material changes first

Setup

bash
mkdir doc-compare && cd doc-compare
npm init -y
# The source files use ES module syntax (import/export) in .js files,
# so the package must be marked as an ES module or Node will refuse to load them.
npm pkg set type=module
npm install express multer pdf-parse openai diff-match-patch dotenv
bash
# .env
OPENAI_API_KEY=sk-your-key-here

Comparison Service

js
// src/services/compareService.js
import pdfParse from 'pdf-parse';
import OpenAI from 'openai';
import { diff_match_patch } from 'diff-match-patch';

// OpenAI API client used by compareDocuments; the key is read from the
// OPENAI_API_KEY environment variable when this module is first loaded.
const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });
// Single diff-match-patch instance used by getChangedRegions to compute the text diff.
const dmp = new diff_match_patch();

/**
 * Turn an uploaded file's raw bytes into a string suitable for diffing.
 *
 * @param {Buffer} buffer - Raw file contents.
 * @param {string} mimetype - MIME type reported for the file.
 * @returns {Promise<string>} The `text` produced by pdf-parse when the MIME type
 *   is exactly `application/pdf`; otherwise the buffer decoded as UTF-8.
 */
async function extractText(buffer, mimetype) {
  const isPdf = mimetype === 'application/pdf';
  if (!isPdf) {
    // Everything that is not a PDF is treated as plain UTF-8 text.
    return buffer.toString('utf-8');
  }

  const parsed = await pdfParse(buffer);
  return parsed.text;
}

/**
 * Diff two texts and collect the fragments that were inserted or deleted.
 *
 * @param {string} textA - Original document text.
 * @param {string} textB - Revised document text.
 * @returns {{ additions: string[], removals: string[] }} Trimmed, non-empty
 *   fragments in diff order: `additions` holds op `1` entries, `removals`
 *   holds op `-1` entries. Fragments that are only whitespace are dropped.
 */
function getChangedRegions(textA, textB) {
  const diffs = dmp.diff_main(textA, textB);
  // Post-processes `diffs` in place; the return value is not used.
  dmp.diff_cleanupSemantic(diffs);

  const additions = [];
  const removals = [];

  for (const [op, text] of diffs) {
    const bucket = op === 1 ? additions : op === -1 ? removals : null;
    if (bucket === null) {
      continue; // unchanged text is not reported
    }
    const fragment = text.trim();
    if (fragment) {
      bucket.push(fragment);
    }
  }

  return { additions, removals };
}

/**
 * Compare two document versions and return an AI-written analysis of the changes.
 *
 * Both inputs are converted to text, diffed locally, and only the changed
 * fragments are sent to the model.
 *
 * @param {Buffer} bufferA - Raw contents of the original document.
 * @param {string} mimetypeA - MIME type of the original document.
 * @param {Buffer} bufferB - Raw contents of the revised document.
 * @param {string} mimetypeB - MIME type of the revised document.
 * @returns {Promise<object>} `{ identical: true, changes: [], summary }` when the
 *   diff finds nothing to report; otherwise the JSON object produced by the
 *   model (requested shape is described in the system prompt below).
 * @throws {Error} If the model reply is empty, is not valid JSON, or is not a
 *   JSON object. Errors from text extraction or the OpenAI call propagate as-is.
 */
export async function compareDocuments(bufferA, mimetypeA, bufferB, mimetypeB) {
  const [textA, textB] = await Promise.all([
    extractText(bufferA, mimetypeA),
    extractText(bufferB, mimetypeB),
  ]);

  const { additions, removals } = getChangedRegions(textA, textB);

  // Nothing was added or removed, so there is nothing to analyse: skip the AI call.
  if (additions.length === 0 && removals.length === 0) {
    return { identical: true, changes: [], summary: 'The documents are identical.' };
  }

  // Only the first 50 fragments of each kind are sent, which bounds the prompt size.
  const changedContent = `
ADDED TEXT:
${additions.slice(0, 50).join('\n---\n')}

REMOVED TEXT:
${removals.slice(0, 50).join('\n---\n')}`;

  const response = await openai.chat.completions.create({
    model: 'gpt-4o',
    messages: [
      {
        role: 'system',
        content: `You are an expert document analyst. Analyze the text additions and removals between two document versions.
Return ONLY a JSON object — no markdown:
{
  "identical": false,
  "summary": "2-3 sentence plain-English summary of what changed and why it matters",
  "changes": [
    {
      "type": "addition | removal | modification",
      "section": "best guess at which section this change is in",
      "description": "plain-English explanation of this specific change",
      "significance": "high | medium | low",
      "addedText": "relevant added text or null",
      "removedText": "relevant removed text or null"
    }
  ],
  "overallSignificance": "high | medium | low",
  "recommendedAction": "string — what the reviewer should do next"
}`,
      },
      { role: 'user', content: changedContent },
    ],
    temperature: 0.2,
    response_format: { type: 'json_object' },
  });

  // JSON.parse(null) yields null rather than throwing, so a missing reply must
  // be rejected explicitly or it would be returned as a "successful" null result.
  const content = response.choices?.[0]?.message?.content;
  if (typeof content !== 'string' || content.trim() === '') {
    throw new Error('The AI model returned an empty response.');
  }

  let analysis;
  try {
    analysis = JSON.parse(content);
  } catch (err) {
    throw new Error('The AI model returned a response that is not valid JSON.', { cause: err });
  }

  if (analysis === null || typeof analysis !== 'object' || Array.isArray(analysis)) {
    throw new Error('The AI model returned JSON that is not an object.');
  }

  return analysis;
}

API Route

js
// src/routes/compare.js
import { Router } from 'express';
import multer from 'multer';
import { compareDocuments } from '../services/compareService.js';

const router = Router();

// Uploaded files are kept in memory (handlers read them via `file.buffer`),
// with a 10 MiB file-size limit.
const upload = multer({
  storage: multer.memoryStorage(),
  limits: { fileSize: 10 * 1024 * 1024 },
});

// Accept at most one file under each of the two expected form fields.
const documentFields = upload.fields([
  { name: 'docA', maxCount: 1 },
  { name: 'docB', maxCount: 1 },
]);

/**
 * POST / — compare the uploaded `docA` and `docB` files.
 *
 * Responds 400 when either file is missing, otherwise
 * `{ success: true, result }` with the comparison result. Any thrown error
 * is forwarded to the next error-handling middleware.
 */
async function handleCompare(req, res, next) {
  try {
    const { docA: filesA, docB: filesB } = req.files ?? {};
    const fileA = filesA?.[0];
    const fileB = filesB?.[0];

    if (!(fileA && fileB)) {
      res.status(400).json({ error: 'Upload two documents: docA and docB' });
      return;
    }

    const result = await compareDocuments(
      fileA.buffer,
      fileA.mimetype,
      fileB.buffer,
      fileB.mimetype,
    );
    res.json({ success: true, result });
  } catch (err) {
    next(err);
  }
}

router.post('/', documentFields, handleCompare);

export default router;

Server

js
// src/server.js
import 'dotenv/config';
import express from 'express';
import compareRouter from './routes/compare.js';

const app = express();
app.use(express.json());
app.use('/compare', compareRouter);

// Central error handler. Express identifies error middleware by its four
// parameters, so all four must stay in the signature.
app.use((err, _req, res, next) => {
  // If a response has already started, a new status/body cannot be sent;
  // hand the error to Express's default handler instead.
  if (res.headersSent) {
    return next(err);
  }

  // Upload errors raised by multer are the client's fault, not a server failure:
  // an oversized file maps to 413, any other MulterError to 400.
  let status = 500;
  if (err.code === 'LIMIT_FILE_SIZE') {
    status = 413;
  } else if (err.name === 'MulterError') {
    status = 400;
  }

  return res.status(status).json({ error: err.message });
});

app.listen(process.env.PORT ?? 3000, () => console.log('Doc Compare running'));

Testing

bash
curl -X POST http://localhost:3000/compare \
  -F "docA=@contract-v1.pdf" \
  -F "docB=@contract-v2.pdf"

Sample response:

json
{
  "summary": "Version 2 reduces the liability cap from 12 months to 3 months of fees and adds a mandatory arbitration clause replacing litigation. These are material changes that significantly increase risk for the customer.",
  "overallSignificance": "high",
  "changes": [
    {
      "type": "modification",
      "section": "Section 8 — Limitation of Liability",
      "description": "Liability cap reduced from 12 months of fees to 3 months",
      "significance": "high",
      "removedText": "not exceed the total fees paid in the twelve (12) months",
      "addedText": "not exceed the total fees paid in the three (3) months"
    },
    {
      "type": "addition",
      "section": "Section 12 — Dispute Resolution",
      "description": "Mandatory binding arbitration clause added, waiving right to jury trial",
      "significance": "high",
      "addedText": "Any dispute shall be resolved by binding arbitration..."
    }
  ],
  "recommendedAction": "Escalate to legal counsel before signing. The liability cap reduction and arbitration waiver are non-standard and significantly favour the vendor."
}

Build 50 AI Automation Tools — Tool 5 of 50

Document comparison is done. Continue to Tool 6 to convert meeting notes into structured action items.


    Summary

    • diff-match-patch provides a fast, accurate text diff before involving the AI
    • Sending only changed regions to GPT-4o dramatically reduces token cost and latency
    • The significance score lets users triage changes — review high-significance items first
    • Works on PDFs, plain text, and Markdown — extend with mammoth for Word documents

    Continue to Tool 6: Meeting Notes to Action Items →