AI · Node.js · Automation

Document Comparison Tool with AI

TT
TopicTrick Team
Document Comparison Tool with AI

Document Comparison Tool with AI

Reviewing the differences between two contract versions, policy documents, or terms of service is time-consuming and error-prone. This tool compares any two text documents and returns a plain-English analysis of what changed, what was added, and what was removed — with a significance assessment for each change.

This is Tool 5 of the Build 50 AI Automation Tools course.


What You'll Build

  • POST /compare — upload two documents as multipart file fields docA and docB (PDF, or text sent as a plain-text/Markdown file), receive structured diff analysis
  • Identifies additions, removals, and modifications with semantic explanations
  • Assigns significance scores so you focus on material changes first

Setup

bash
mkdir doc-compare && cd doc-compare
npm init -y
npm install express multer pdf-parse openai diff-match-patch dotenv
bash
# .env
OPENAI_API_KEY=sk-your-key-here

Comparison Service

js
// src/services/compareService.js
import pdfParse from 'pdf-parse';
import OpenAI from 'openai';
import { diff_match_patch } from 'diff-match-patch';

// Shared OpenAI client; the API key is read from the OPENAI_API_KEY environment variable.
const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });
// Single diff-match-patch instance reused for every text diff in this module.
const dmp = new diff_match_patch();

/**
 * Turns an uploaded file's bytes into a string.
 *
 * @param {Buffer} buffer - Raw file contents.
 * @param {string} mimetype - MIME type reported for the upload.
 * @returns {Promise<string>} Text extracted via pdf-parse when the MIME type is
 *   exactly 'application/pdf'; otherwise the buffer decoded as UTF-8.
 */
async function extractText(buffer, mimetype) {
  const isPdf = mimetype === 'application/pdf';

  // Everything that is not declared as a PDF is treated as UTF-8 text.
  if (!isPdf) {
    return buffer.toString('utf-8');
  }

  const parsed = await pdfParse(buffer);
  return parsed.text;
}

/**
 * Diffs two texts and collects the inserted and deleted fragments.
 *
 * @param {string} textA - Original text.
 * @param {string} textB - Revised text.
 * @returns {{ additions: string[], removals: string[] }} Trimmed fragments in
 *   diff order; fragments that are empty after trimming are dropped.
 */
function getChangedRegions(textA, textB) {
  // Character-level diff, then merged into human-meaningful chunks in place.
  const diffs = dmp.diff_main(textA, textB);
  dmp.diff_cleanupSemantic(diffs);

  const additions = [];
  const removals = [];

  for (const [op, text] of diffs) {
    // op: 1 = inserted in textB, -1 = deleted from textA; anything else is unchanged.
    let bucket = null;
    if (op === 1) {
      bucket = additions;
    } else if (op === -1) {
      bucket = removals;
    }
    if (bucket === null) continue;

    const fragment = text.trim();
    if (fragment) {
      bucket.push(fragment);
    }
  }

  return { additions, removals };
}

/**
 * Compares two documents and returns an AI-generated analysis of the changes.
 *
 * Both inputs are converted to text, diffed, and only the changed fragments
 * (at most the first 50 additions and the first 50 removals) are sent to the model.
 *
 * @param {Buffer} bufferA - Contents of the original document.
 * @param {string} mimetypeA - MIME type of the original document.
 * @param {Buffer} bufferB - Contents of the revised document.
 * @param {string} mimetypeB - MIME type of the revised document.
 * @returns {Promise<object>} `{ identical: true, changes: [], summary }` when the
 *   diff finds no non-whitespace changes; otherwise the JSON object produced by
 *   the model (shape requested in the system prompt below).
 * @throws {Error} If the model returns no content, its output was cut off at the
 *   token limit, or the content is not valid JSON (original error in `cause`).
 */
export async function compareDocuments(bufferA, mimetypeA, bufferB, mimetypeB) {
  const [textA, textB] = await Promise.all([
    extractText(bufferA, mimetypeA),
    extractText(bufferB, mimetypeB),
  ]);

  const { additions, removals } = getChangedRegions(textA, textB);

  // No non-whitespace differences were found, so there is nothing to analyze: skip the AI call.
  if (additions.length === 0 && removals.length === 0) {
    return { identical: true, changes: [], summary: 'The documents are identical.' };
  }

  const changedContent = `
ADDED TEXT:
${additions.slice(0, 50).join('\n---\n')}

REMOVED TEXT:
${removals.slice(0, 50).join('\n---\n')}`;

  const response = await openai.chat.completions.create({
    model: 'gpt-4o',
    messages: [
      {
        role: 'system',
        content: `You are an expert document analyst. Analyze the text additions and removals between two document versions.
Return ONLY a JSON object — no markdown:
{
  "identical": false,
  "summary": "2-3 sentence plain-English summary of what changed and why it matters",
  "changes": [
    {
      "type": "addition | removal | modification",
      "section": "best guess at which section this change is in",
      "description": "plain-English explanation of this specific change",
      "significance": "high | medium | low",
      "addedText": "relevant added text or null",
      "removedText": "relevant removed text or null"
    }
  ],
  "overallSignificance": "high | medium | low",
  "recommendedAction": "string — what the reviewer should do next"
}`,
      },
      { role: 'user', content: changedContent },
    ],
    temperature: 0.2,
    response_format: { type: 'json_object' },
  });

  const choice = response.choices?.[0];
  const content = choice?.message?.content;

  // JSON.parse(null) would quietly yield null, so reject missing content explicitly
  // (e.g. a refusal or a content-filtered reply).
  if (typeof content !== 'string' || content.trim() === '') {
    throw new Error('The model returned no analysis content.');
  }

  // A reply cut off at the token limit is partial JSON; report that instead of a bare SyntaxError.
  if (choice.finish_reason === 'length') {
    throw new Error('The model response was truncated before the analysis was complete.');
  }

  try {
    return JSON.parse(content);
  } catch (err) {
    throw new Error('The model returned malformed JSON.', { cause: err });
  }
}

API Route

js
// src/routes/compare.js
import { Router } from 'express';
import multer from 'multer';
import { compareDocuments } from '../services/compareService.js';

const router = Router();

// Uploads are held in memory and capped at 10 MiB per file.
const upload = multer({
  storage: multer.memoryStorage(),
  limits: { fileSize: 10 * 1024 * 1024 },
});

// Accept exactly one file under each of the two expected form fields.
const acceptDocuments = upload.fields([
  { name: 'docA', maxCount: 1 },
  { name: 'docB', maxCount: 1 },
]);

/**
 * POST / — compares the uploaded docA and docB files.
 * Responds 400 when either file is missing; otherwise responds with
 * `{ success: true, result }`. Any thrown error is forwarded to `next`.
 */
async function handleCompare(req, res, next) {
  const firstUpload = (field) => req.files?.[field]?.[0];

  try {
    const docA = firstUpload('docA');
    const docB = firstUpload('docB');

    const hasBoth = Boolean(docA) && Boolean(docB);
    if (!hasBoth) {
      return res.status(400).json({ error: 'Upload two documents: docA and docB' });
    }

    const result = await compareDocuments(
      docA.buffer,
      docA.mimetype,
      docB.buffer,
      docB.mimetype,
    );
    res.json({ success: true, result });
  } catch (err) {
    next(err);
  }
}

router.post('/', acceptDocuments, handleCompare);

export default router;

Server

js
// src/server.js
import 'dotenv/config';
import express from 'express';
import compareRouter from './routes/compare.js';

const app = express();
app.use(express.json());
app.use('/compare', compareRouter);

/**
 * Chooses the HTTP status for an error that reached the final error handler.
 *
 * @param {Error & { code?: string, status?: number, statusCode?: number, expose?: boolean }} err
 * @returns {number} 413 for an oversized upload, 400 for other upload errors,
 *   the error's own 4xx status when it is marked as safe to expose
 *   (e.g. a malformed JSON body), and 500 for everything else.
 */
function statusForError(err) {
  // Upload-limit violations are client mistakes, not server faults.
  if (err?.name === 'MulterError') {
    return err.code === 'LIMIT_FILE_SIZE' ? 413 : 400;
  }

  // Only trust a status on errors explicitly flagged as client-facing, so that
  // statuses carried by upstream API failures are still reported as 500.
  const status = err?.status ?? err?.statusCode;
  if (err?.expose === true && Number.isInteger(status) && status >= 400 && status < 500) {
    return status;
  }

  return 500;
}

// Final error handler: always answers with a JSON body carrying the error message.
app.use((err, _req, res, _next) => res.status(statusForError(err)).json({ error: err.message }));
app.listen(process.env.PORT ?? 3000, () => console.log('Doc Compare running'));

Testing

bash
curl -X POST http://localhost:3000/compare \
  -F "docA=@contract-v1.pdf" \
  -F "docB=@contract-v2.pdf"

Sample response (the contents of the "result" field; the API wraps it as { "success": true, "result": { ... } }):

json
{
  "summary": "Version 2 reduces the liability cap from 12 months to 3 months of fees and adds a mandatory arbitration clause replacing litigation. These are material changes that significantly increase risk for the customer.",
  "overallSignificance": "high",
  "changes": [
    {
      "type": "modification",
      "section": "Section 8 — Limitation of Liability",
      "description": "Liability cap reduced from 12 months of fees to 3 months",
      "significance": "high",
      "removedText": "not exceed the total fees paid in the twelve (12) months",
      "addedText": "not exceed the total fees paid in the three (3) months"
    },
    {
      "type": "addition",
      "section": "Section 12 — Dispute Resolution",
      "description": "Mandatory binding arbitration clause added, waiving right to jury trial",
      "significance": "high",
      "addedText": "Any dispute shall be resolved by binding arbitration..."
    }
  ],
  "recommendedAction": "Escalate to legal counsel before signing. The liability cap reduction and arbitration waiver are non-standard and significantly favour the vendor."
}

Build 50 AI Automation Tools — Tool 5 of 50

Document comparison is done. Continue to Tool 6 to convert meeting notes into structured action items.


    Summary

    • diff-match-patch provides a fast, accurate text diff before involving the AI
    • Sending only changed regions to GPT-4o dramatically reduces token cost and latency
    • The significance score lets users triage changes — review high-significance items first
    • Works on PDFs, plain text, and Markdown — extend with mammoth for Word documents

    Continue to Tool 6: Meeting Notes to Action Items →