AI · Node.js · Automation

Image Caption Generator with GPT-4o Vision

TT
TopicTrick Team
Image Caption Generator with GPT-4o Vision

Image Caption Generator with GPT-4o Vision

Writing alt text, captions, and social media copy for images is tedious at scale. This tool accepts an uploaded image (JPEG, PNG, GIF, or WebP) or an image URL and generates descriptive captions, accessibility alt text, SEO-optimised descriptions, and platform-specific social media copy using GPT-4o Vision.

This is Tool 17 of the Build 50 AI Automation Tools course.


What You'll Build

  • POST /caption — upload an image, receive captions for multiple use cases
  • POST /caption/url — provide an image URL instead of uploading
  • POST /caption/batch — caption multiple images in one request

Setup

bash
mkdir image-captioner && cd image-captioner
npm init -y
# The source files use ES module `import` syntax, so mark the package as ESM;
# `npm init -y` creates a CommonJS package by default.
npm pkg set type=module
npm install express multer openai dotenv
bash
# .env
# API key passed to the OpenAI client in src/services/visionService.js.
# Keep this file out of version control.
OPENAI_API_KEY=sk-your-key-here
# Port the Express server listens on; server.js falls back to 3000 when unset.
PORT=3000

Vision Caption Service

js
// src/services/visionService.js
import OpenAI from 'openai';

// Module-level OpenAI client; the API key is read from the OPENAI_API_KEY
// environment variable when this module is first loaded.
const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });

/**
 * Encode raw image bytes as a base64 `data:` URL.
 *
 * @param {Buffer} buffer - Image bytes.
 * @param {string} mimetype - MIME type placed in the URL header, e.g. `image/png`.
 * @returns {string} A string of the form `data:<mimetype>;base64,<payload>`.
 */
function imageToDataUrl(buffer, mimetype) {
  const header = `data:${mimetype};base64,`;
  const payload = buffer.toString('base64');
  return header + payload;
}

// Values accepted for the vision `detail` setting; anything else falls back to 'high'.
const SUPPORTED_DETAIL_LEVELS = new Set(['low', 'high', 'auto']);

/**
 * Generate captions, alt text and social media copy for a single image
 * using GPT-4o Vision.
 *
 * @param {string | { dataUrl: string }} imageSource - Either an image URL string,
 *   or an object whose `dataUrl` property is a base64 `data:` URL.
 * @param {object} [options]
 * @param {string} [options.tone='professional'] - `'casual'` selects the casual
 *   prompt; any other value selects the professional prompt.
 * @param {string} [options.context=''] - Free-text hint appended to the system prompt.
 * @param {string} [options.detail='high'] - Vision detail level (`'low'`, `'high'`
 *   or `'auto'`). Unrecognised values fall back to `'high'`.
 * @returns {Promise<object>} The JSON object produced by the model (description,
 *   accessibilityAlt, seoAlt, socialMediaCaptions, ...).
 * @throws {TypeError} If `imageSource` is neither a non-empty string nor an
 *   object with a non-empty string `dataUrl`.
 * @throws {Error} If the model returns no content or content that is not valid
 *   JSON (the parse error is attached as `cause`).
 */
export async function generateCaptions(imageSource, options = {}) {
  // `options ?? {}` also tolerates an explicit null, which the default parameter does not.
  const { tone = 'professional', context = '', detail = 'high' } = options ?? {};

  const imageUrl = typeof imageSource === 'string' ? imageSource : imageSource?.dataUrl;
  if (typeof imageUrl !== 'string' || imageUrl.trim() === '') {
    throw new TypeError(
      'imageSource must be an image URL string or an object with a dataUrl string',
    );
  }

  // `detail` usually comes straight from a request body, so never forward an unknown value.
  const imageDetail = SUPPORTED_DETAIL_LEVELS.has(detail) ? detail : 'high';
  const imageContent = { type: 'image_url', image_url: { url: imageUrl, detail: imageDetail } };

  const contextNote = context ? `Additional context: ${context}` : '';
  const toneNote = tone === 'casual' ? 'Use a casual, engaging tone.' : 'Use a professional, descriptive tone.';

  const response = await openai.chat.completions.create({
    model: 'gpt-4o',
    messages: [
      {
        role: 'system',
        content: `You are an expert image analyst and copywriter. ${toneNote} ${contextNote}
Analyze the image and return ONLY a JSON object — no markdown:
{
  "description": "2-3 sentence detailed description of what is in the image",
  "accessibilityAlt": "concise alt text under 125 chars for screen readers — describe the visual content factually",
  "seoAlt": "SEO-optimised alt text under 125 chars — descriptive and keyword-rich",
  "socialMediaCaptions": {
    "instagram": "engaging caption with emojis and hashtags (max 220 chars)",
    "twitter": "concise tweet caption (max 240 chars, no hashtags)",
    "linkedin": "professional caption with 1-2 relevant hashtags (max 300 chars)"
  },
  "imageType": "photograph | illustration | diagram | infographic | screenshot | artwork | other",
  "dominantColors": ["top 3 dominant colors by name"],
  "mood": "string — overall mood or feeling of the image",
  "subjects": ["main subjects/objects in the image"],
  "suggestedFilename": "descriptive-filename-without-extension"
}`,
      },
      {
        role: 'user',
        content: [
          { type: 'text', text: 'Generate captions for this image:' },
          imageContent,
        ],
      },
    ],
    temperature: 0.5,
    response_format: { type: 'json_object' },
  });

  // A missing/null message content would otherwise be parsed by JSON.parse(null)
  // into `null` and silently reported to the caller as a success.
  const content = response.choices?.[0]?.message?.content;
  if (!content) {
    throw new Error('Vision model returned no caption content');
  }

  try {
    return JSON.parse(content);
  } catch (err) {
    throw new Error('Vision model returned invalid JSON', { cause: err });
  }
}

API Routes + Server

js
// src/server.js
import 'dotenv/config';
import express from 'express';
import multer from 'multer';
import { generateCaptions } from './services/visionService.js';

// Express application; JSON body parsing serves the URL-based endpoints.
const app = express();
app.use(express.json());

// Multipart upload middleware: files are held in memory, capped at 20 MiB,
// and restricted to the four listed image MIME types.
const upload = multer({
  storage: multer.memoryStorage(),
  limits: { fileSize: 20 * 1024 * 1024 },
  fileFilter(_req, file, cb) {
    const allowed = ['image/jpeg', 'image/png', 'image/gif', 'image/webp'];
    if (allowed.includes(file.mimetype)) {
      cb(null, true);
    } else {
      cb(new Error('Images only'));
    }
  },
});

// POST /caption — caption a single image uploaded in the multipart field "image".
// Optional form fields `tone`, `context` and `detail` are forwarded to generateCaptions.
app.post('/caption', upload.single('image'), async (req, res, next) => {
  try {
    const uploaded = req.file;
    if (!uploaded) {
      return res.status(400).json({ error: 'No image uploaded' });
    }

    // Inline the bytes as a data URL so the image needs no public address.
    const encoded = uploaded.buffer.toString('base64');
    const imageSource = { dataUrl: `data:${uploaded.mimetype};base64,${encoded}` };

    const form = req.body;
    const captions = await generateCaptions(imageSource, {
      tone: form.tone,
      context: form.context,
      detail: form.detail,
    });

    res.json({ success: true, filename: uploaded.originalname, ...captions });
  } catch (err) {
    next(err);
  }
});

// POST /caption/url — caption an image referenced by URL.
// JSON body: { url, tone?, context?, detail? }.
// Responds 400 when `url` is missing or is not an http(s)/data:image URL string;
// all other failures are passed to the error-handling middleware.
app.post('/caption/url', async (req, res, next) => {
  try {
    // Express 5 leaves req.body undefined when no JSON body was parsed.
    const { url, tone, context, detail } = req.body ?? {};
    if (!url) return res.status(400).json({ error: 'image url required' });

    // Reject values the vision API cannot fetch instead of surfacing them as a 500 later.
    if (typeof url !== 'string' || !/^(https?:\/\/|data:image\/)/i.test(url.trim())) {
      return res.status(400).json({ error: 'image url must be an http(s) or data:image URL string' });
    }

    const result = await generateCaptions(url.trim(), { tone, context, detail });
    res.json({ success: true, url, ...result });
  } catch (err) { next(err); }
});

// POST /caption/batch — caption several image URLs in one request.
// JSON body: { urls: string[], tone?, context? }. Every image is processed with
// detail 'low'. A failure for one URL is reported in that URL's result entry and
// does not fail the whole request.
// NOTE(review): all URLs are captioned concurrently with no upper bound on the
// batch size — consider capping it before exposing this endpoint publicly.
app.post('/caption/batch', async (req, res, next) => {
  try {
    // Express 5 leaves req.body undefined when no JSON body was parsed.
    const { urls, tone, context } = req.body ?? {};

    // A plain `urls?.length` check lets a string through, which then crashes on `.map`.
    if (!Array.isArray(urls) || urls.length === 0) {
      return res.status(400).json({ error: 'urls array required' });
    }

    const outcomes = await Promise.allSettled(
      urls.map(async (url) => {
        if (typeof url !== 'string' || url.trim() === '') {
          throw new TypeError('image url must be a non-empty string');
        }
        return generateCaptions(url, { tone, context, detail: 'low' });
      })
    );

    res.json({
      success: true,
      results: outcomes.map((outcome, i) =>
        outcome.status === 'fulfilled'
          ? { url: urls[i], ...outcome.value }
          // A rejection reason is not guaranteed to be an Error instance.
          : { url: urls[i], error: outcome.reason?.message ?? String(outcome.reason) }
      ),
    });
  } catch (err) { next(err); }
});

// Liveness probe.
app.get('/health', (_req, res) => res.json({ status: 'ok' }));

// Central error handler (Express recognises it by its four parameters).
// Client mistakes are reported with a 4xx status instead of a blanket 500:
//  - multer upload errors -> 413 for an oversized file, 400 otherwise
//  - errors flagged `expose` with a 4xx `status` (e.g. a malformed JSON body
//    rejected by express.json()) -> that status
// Everything else, including upstream API failures, stays a 500.
app.use((err, _req, res, next) => {
  // Once a response has started, delegate to Express's default handler.
  if (res.headersSent) return next(err);

  let status = 500;
  if (err instanceof multer.MulterError) {
    status = err.code === 'LIMIT_FILE_SIZE' ? 413 : 400;
  } else if (
    err?.expose === true &&
    Number.isInteger(err.status) &&
    err.status >= 400 &&
    err.status < 500
  ) {
    status = err.status;
  }

  return res.status(status).json({ error: err?.message ?? 'Internal Server Error' });
});

app.listen(process.env.PORT ?? 3000, () => console.log('Image Captioner running'));

Testing

bash
# Upload an image as multipart form data (file field "image");
# the "context" and "tone" form fields are optional
curl -X POST http://localhost:3000/caption \
  -F "image=@product-photo.jpg" \
  -F "context=luxury watch brand" \
  -F "tone=professional"

# Caption an image by URL using a JSON body; "tone" is optional
curl -X POST http://localhost:3000/caption/url \
  -H "Content-Type: application/json" \
  -d '{"url": "https://example.com/photo.jpg", "tone": "casual"}'

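Sample response (the `success` flag and the echoed `filename` or `url` field that the routes add are omitted for brevity):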

json
{
  "description": "A close-up photograph of a stainless steel mechanical watch on a dark leather strap, resting on a wooden surface. The watch face shows Roman numerals and a moon phase complication. Natural light catches the brushed steel bezel.",
  "accessibilityAlt": "Stainless steel mechanical watch with Roman numerals and moon phase on dark leather strap",
  "seoAlt": "Luxury stainless steel mechanical watch with moon phase complication on leather strap",
  "socialMediaCaptions": {
    "instagram": "Time is the ultimate luxury. ⌚✨ Crafted for those who appreciate the art of precision. #watchmaking #luxurywatches #mechanicalwatch #horology",
    "twitter": "There's something timeless about a mechanical watch. Every tick is a reminder that craftsmanship never goes out of style.",
    "linkedin": "Precision engineering meets timeless design. This is what happens when craftsmanship and innovation work together. #LuxuryGoods #ManufacturingExcellence"
  },
  "imageType": "photograph",
  "dominantColors": ["silver", "dark brown", "cream"],
  "mood": "elegant and refined",
  "subjects": ["mechanical watch", "leather strap", "wooden surface"],
  "suggestedFilename": "stainless-steel-mechanical-watch-moon-phase-leather-strap"
}

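Website Alt Text Automation

This example needs two extra packages (`npm install cheerio axios`) and assumes `generateCaptions` has been imported from `src/services/visionService.js`.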

js
import * as cheerio from 'cheerio';
import axios from 'axios';

/**
 * Fetch one HTML page, find `<img>` elements that have a `src` but no alt text,
 * and ask the vision service for suggested accessibility alt text for each.
 *
 * @param {string} siteUrl - URL of the page to scan; also the base used to
 *   resolve relative image `src` values.
 * @param {object} [options]
 * @param {boolean} [options.includeEmptyAlt=true] - When true (the default),
 *   an empty `alt=""` counts as missing. Pass false to skip such images, since an
 *   empty alt attribute is the conventional marker for decorative images.
 * @returns {Promise<Array<{ src: string, suggestedAlt: string }>>} One entry per
 *   matching `<img>`, in document order, keyed by the original `src` value.
 * @throws Rejects if the page fetch fails, a `src` cannot be resolved to a URL,
 *   or any caption request fails.
 */
async function fixMissingAltText(siteUrl, { includeEmptyAlt = true } = {}) {
  const { data } = await axios.get(siteUrl);
  const $ = cheerio.load(data);
  const imgsMissingAlt = [];

  $('img').each((_, el) => {
    const src = $(el).attr('src');
    const alt = $(el).attr('alt');
    const isMissing = includeEmptyAlt ? !alt : alt === undefined;
    if (src && isMissing) imgsMissingAlt.push(src);
  });

  // The same image often appears several times on a page; share one caption
  // request per distinct resolved URL rather than paying for each occurrence.
  const pendingByUrl = new Map();
  const captionFor = (fullUrl) => {
    if (!pendingByUrl.has(fullUrl)) {
      pendingByUrl.set(fullUrl, generateCaptions(fullUrl, { detail: 'low' }));
    }
    return pendingByUrl.get(fullUrl);
  };

  return Promise.all(imgsMissingAlt.map(async (src) => {
    // `new URL` handles absolute, protocol-relative and relative values alike;
    // a `startsWith('http')` test would misread relative paths such as "http-assets/a.png".
    const fullUrl = new URL(src, siteUrl).href;
    const caption = await captionFor(fullUrl);
    return { src, suggestedAlt: caption.accessibilityAlt };
  }));
}

Build 50 AI Automation Tools — Tool 17 of 50

Image captioning is live. Continue to Tool 18 to build an AI product image analyzer for e-commerce.


    Summary

    • Base64 encoding enables image upload without requiring public URLs
    • detail: 'low' for batch processing, detail: 'high' for detailed single-image analysis
    • Platform-specific captions are generated in a single request, with the copy adapted to Instagram, Twitter, and LinkedIn
    • Accessibility alt text is factual and concise — different from the SEO-optimised version
    • The website automation example generates alt text suggestions for every image on a page that is missing one; run it per page to cover a whole site

    Continue to Tool 18: Product Image Analyzer for E-commerce →