AI · Node.js · Automation
Image Caption Generator with GPT-4o Vision
TT
TopicTrick Team
Image Caption Generator with GPT-4o Vision
Writing alt text, captions, and social media copy for images is tedious at scale. This tool accepts an uploaded image or an image URL and generates descriptive captions, accessibility alt text, SEO-optimised descriptions, and platform-specific social media copy using GPT-4o Vision.
This is Tool 17 of the Build 50 AI Automation Tools course.
What You'll Build
- POST /caption — upload an image, receive captions for multiple use cases
- POST /caption/url — provide an image URL instead of uploading
- POST /caption/batch — caption multiple images in one request
Setup
bash
mkdir image-captioner && cd image-captioner
npm init -y
npm install express multer openai dotenv
bash
# .env
OPENAI_API_KEY=sk-your-key-here
PORT=3000
Vision Caption Service
js
// src/services/visionService.js
import OpenAI from 'openai';
// OpenAI client used by this module; the API key is read from the OPENAI_API_KEY environment variable.
const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });
/**
 * Encode raw image bytes as a base64 `data:` URL.
 *
 * @param {Buffer} buffer - Image bytes; its `toString('base64')` supplies the payload.
 * @param {string} mimetype - MIME type placed in the URL header, e.g. `image/png`.
 * @returns {string} A string of the form `data:<mimetype>;base64,<payload>`.
 */
function imageToDataUrl(buffer, mimetype) {
  const header = `data:${mimetype};base64,`;
  const payload = buffer.toString('base64');
  return `${header}${payload}`;
}
/** Values accepted for the `detail` setting of an `image_url` content part. */
const IMAGE_DETAIL_LEVELS = ['low', 'high', 'auto'];

/**
 * Ask GPT-4o to describe one image and return captions, alt text and social copy.
 *
 * @param {string | {dataUrl: string}} imageSource - Either an image URL string, or an
 *   object whose `dataUrl` property holds a base64 `data:` URL.
 * @param {object} [options]
 * @param {string} [options.tone='professional'] - `'casual'` selects the casual prompt;
 *   every other value selects the professional prompt.
 * @param {string} [options.context=''] - Extra context appended to the system prompt.
 * @param {string} [options.detail='high'] - One of `'low'`, `'high'`, `'auto'`.
 * @returns {Promise<object>} The JSON object parsed from the model's reply.
 * @throws {TypeError} If no usable image URL / data URL was supplied.
 * @throws {RangeError} If `detail` is not one of the accepted levels.
 * @throws {Error} If the model returns no content or content that is not valid JSON.
 */
export async function generateCaptions(imageSource, options = {}) {
  // `??` (rather than destructuring defaults) so that explicit nulls from a JSON body
  // also fall back to the defaults.
  const settings = options ?? {};
  const tone = settings.tone ?? 'professional';
  const context = settings.context ?? '';
  const detail = settings.detail ?? 'high';

  const imageUrl = typeof imageSource === 'string' ? imageSource : imageSource?.dataUrl;
  if (typeof imageUrl !== 'string' || imageUrl === '') {
    throw new TypeError('imageSource must be an image URL string or an object with a dataUrl string');
  }
  // Reject bad values locally instead of spending a request on a call that cannot succeed.
  if (!IMAGE_DETAIL_LEVELS.includes(detail)) {
    throw new RangeError(`detail must be one of: ${IMAGE_DETAIL_LEVELS.join(', ')}`);
  }

  const imageContent = { type: 'image_url', image_url: { url: imageUrl, detail } };
  const contextNote = context ? `Additional context: ${context}` : '';
  const toneNote = tone === 'casual' ? 'Use a casual, engaging tone.' : 'Use a professional, descriptive tone.';

  // The continuation lines of this template literal are deliberately flush-left:
  // indenting them would add whitespace to the prompt text.
  const systemPrompt = `You are an expert image analyst and copywriter. ${toneNote} ${contextNote}
Analyze the image and return ONLY a JSON object — no markdown:
{
"description": "2-3 sentence detailed description of what is in the image",
"accessibilityAlt": "concise alt text under 125 chars for screen readers — describe the visual content factually",
"seoAlt": "SEO-optimised alt text under 125 chars — descriptive and keyword-rich",
"socialMediaCaptions": {
"instagram": "engaging caption with emojis and hashtags (max 220 chars)",
"twitter": "concise tweet caption (max 240 chars, no hashtags)",
"linkedin": "professional caption with 1-2 relevant hashtags (max 300 chars)"
},
"imageType": "photograph | illustration | diagram | infographic | screenshot | artwork | other",
"dominantColors": ["top 3 dominant colors by name"],
"mood": "string — overall mood or feeling of the image",
"subjects": ["main subjects/objects in the image"],
"suggestedFilename": "descriptive-filename-without-extension"
}`;

  const response = await openai.chat.completions.create({
    model: 'gpt-4o',
    messages: [
      { role: 'system', content: systemPrompt },
      {
        role: 'user',
        content: [
          { type: 'text', text: 'Generate captions for this image:' },
          imageContent,
        ],
      },
    ],
    temperature: 0.5,
    response_format: { type: 'json_object' },
  });

  // JSON.parse(null) quietly yields null, so an empty reply must be caught explicitly;
  // otherwise callers would spread `null` and report success with no captions.
  const content = response.choices?.[0]?.message?.content;
  if (!content) {
    throw new Error('Model returned no caption content');
  }
  try {
    return JSON.parse(content);
  } catch (err) {
    throw new Error('Model returned invalid JSON for captions', { cause: err });
  }
}
// API Routes + Server
js
// src/server.js
import 'dotenv/config';
import express from 'express';
import multer from 'multer';
import { generateCaptions } from './services/visionService.js';
// Express application; express.json() parses JSON request bodies for the URL-based routes.
const app = express();
app.use(express.json());

// MIME types accepted for uploaded images (built once, not on every upload).
const ALLOWED_IMAGE_TYPES = new Set(['image/jpeg', 'image/png', 'image/gif', 'image/webp']);

// Multer keeps uploads in memory and caps each file at 20 MiB.
const upload = multer({
  storage: multer.memoryStorage(),
  limits: { fileSize: 20 * 1024 * 1024 },
  fileFilter: (_req, file, cb) => {
    if (ALLOWED_IMAGE_TYPES.has(file.mimetype)) {
      cb(null, true);
      return;
    }
    // Tag the rejection as a client error (415 Unsupported Media Type) so that an
    // error handler which honours `err.status` does not have to report it as a 500.
    cb(Object.assign(new Error('Images only'), { status: 415 }));
  },
});
// Upload-based captioning: multipart form with an `image` file plus optional
// `tone`, `context` and `detail` text fields.
app.post('/caption', upload.single('image'), async (req, res, next) => {
  try {
    const { file } = req;
    if (!file) {
      return res.status(400).json({ error: 'No image uploaded' });
    }

    // The in-memory upload is forwarded to the captioner as a base64 data URL.
    const encoded = file.buffer.toString('base64');
    const source = { dataUrl: `data:${file.mimetype};base64,${encoded}` };

    const { tone, context, detail } = req.body;
    const captions = await generateCaptions(source, { tone, context, detail });

    res.json({ success: true, filename: file.originalname, ...captions });
  } catch (err) {
    next(err);
  }
});
// URL-based captioning: JSON body `{ url, tone?, context?, detail? }`.
app.post('/caption/url', async (req, res, next) => {
  try {
    // req.body can be undefined when no JSON body was parsed (e.g. missing
    // Content-Type under Express 5); fall back to {} so this yields a 400, not a 500.
    const { url, tone, context, detail } = req.body ?? {};

    // Only a non-empty string is a usable image URL; anything else is a client error.
    if (typeof url !== 'string' || url.trim() === '') {
      return res.status(400).json({ error: 'image url required' });
    }

    const result = await generateCaptions(url, { tone, context, detail });
    res.json({ success: true, url, ...result });
  } catch (err) {
    next(err);
  }
});
// Batch URL captioning: JSON body `{ urls: [...], tone?, context? }`.
// Every image is captioned with detail 'low'; one failing image does not fail the batch.
app.post('/caption/batch', async (req, res, next) => {
  try {
    // req.body can be undefined when no JSON body was parsed; treat that as "no urls".
    const { urls, tone, context } = req.body ?? {};

    // Array.isArray, not just `.length`: a plain string also has a length but no `.map`.
    if (!Array.isArray(urls) || urls.length === 0) {
      return res.status(400).json({ error: 'urls array required' });
    }

    const outcomes = await Promise.allSettled(
      urls.map((url) => generateCaptions(url, { tone, context, detail: 'low' })),
    );

    res.json({
      success: true,
      // outcomes[i] corresponds to urls[i]; failures are reported per item.
      results: outcomes.map((outcome, i) =>
        outcome.status === 'fulfilled'
          ? { url: urls[i], ...outcome.value }
          : { url: urls[i], error: outcome.reason?.message ?? String(outcome.reason) },
      ),
    });
  } catch (err) {
    next(err);
  }
});
// Liveness probe.
app.get('/health', (_req, res) => res.json({ status: 'ok' }));

// Central error handler (Express recognises it by its four parameters).
// Client errors keep their own status instead of all being reported as 500:
//  - Multer errors: 413 when the file exceeds the size limit, otherwise 400.
//  - Errors carrying a 4xx/5xx `status` / `statusCode` (e.g. malformed JSON bodies).
app.use((err, _req, res, next) => {
  // Once a response has started, the status can no longer be set; let Express finish it.
  if (res.headersSent) {
    return next(err);
  }

  let status = 500;
  if (err instanceof multer.MulterError) {
    status = err.code === 'LIMIT_FILE_SIZE' ? 413 : 400;
  } else {
    const declared = err?.status ?? err?.statusCode;
    if (Number.isInteger(declared) && declared >= 400 && declared < 600) {
      status = declared;
    }
  }

  res.status(status).json({ error: err?.message ?? 'Internal Server Error' });
});
app.listen(process.env.PORT ?? 3000, () => console.log('Image Captioner running'));
Testing
bash
# Upload an image
curl -X POST http://localhost:3000/caption \
-F "image=@product-photo.jpg" \
-F "context=luxury watch brand" \
-F "tone=professional"
# Use an image URL
curl -X POST http://localhost:3000/caption/url \
-H "Content-Type: application/json" \
-d '{"url": "https://example.com/photo.jpg", "tone": "casual"}'
Sample response:
json
{
"description": "A close-up photograph of a stainless steel mechanical watch on a dark leather strap, resting on a wooden surface. The watch face shows Roman numerals and a moon phase complication. Natural light catches the brushed steel bezel.",
"accessibilityAlt": "Stainless steel mechanical watch with Roman numerals and moon phase on dark leather strap",
"seoAlt": "Luxury stainless steel mechanical watch with moon phase complication on leather strap",
"socialMediaCaptions": {
"instagram": "Time is the ultimate luxury. ⌚✨ Crafted for those who appreciate the art of precision. #watchmaking #luxurywatches #mechanicalwatch #horology",
"twitter": "There's something timeless about a mechanical watch. Every tick is a reminder that craftsmanship never goes out of style.",
"linkedin": "Precision engineering meets timeless design. This is what happens when craftsmanship and innovation work together. #LuxuryGoods #ManufacturingExcellence"
},
"imageType": "photograph",
"dominantColors": ["silver", "dark brown", "cream"],
"mood": "elegant and refined",
"subjects": ["mechanical watch", "leather strap", "wooden surface"],
"suggestedFilename": "stainless-steel-mechanical-watch-moon-phase-leather-strap"
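}
Website Alt Text Automation
This example needs two extra packages (npm install cheerio axios) and the generateCaptions function from the vision service above.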
js
import * as cheerio from 'cheerio';
import axios from 'axios';
/**
 * Fetch a page, collect every <img> that has a `src` but no non-empty `alt`, and ask
 * the captioner for a suggested alt text for each one.
 *
 * Images are captioned independently: a failure for one image is reported on that
 * entry and does not discard the suggestions for the others.
 *
 * @param {string} siteUrl - URL of the page to scan; also the base used to resolve
 *   relative image paths.
 * @returns {Promise<Array<{src: string, suggestedAlt: string} | {src: string, error: string}>>}
 *   One entry per matching image, in document order.
 */
async function fixMissingAltText(siteUrl) {
  const { data } = await axios.get(siteUrl);
  const $ = cheerio.load(data);

  const imgsMissingAlt = [];
  $('img').each((_, el) => {
    const src = $(el).attr('src');
    const alt = $(el).attr('alt');
    // `!alt` matches both a missing attribute and alt="" (which some pages use on
    // purpose for decorative images) — review those suggestions before applying them.
    if (src && !alt) imgsMissingAlt.push(src);
  });

  const outcomes = await Promise.allSettled(
    imgsMissingAlt.map(async (src) => {
      // new URL() returns absolute URLs unchanged and resolves relative ones against
      // the page, so no prefix sniffing is needed (a path like "http-icons/a.png"
      // is relative even though it starts with "http").
      const fullUrl = new URL(src, siteUrl).href;
      const caption = await generateCaptions(fullUrl, { detail: 'low' });
      return { src, suggestedAlt: caption.accessibilityAlt };
    }),
  );

  // outcomes[i] corresponds to imgsMissingAlt[i].
  return outcomes.map((outcome, i) =>
    outcome.status === 'fulfilled'
      ? outcome.value
      : { src: imgsMissingAlt[i], error: outcome.reason?.message ?? String(outcome.reason) },
  );
}
// Build 50 AI Automation Tools — Tool 17 of 50
Image captioning is live. Continue to Tool 18 to build an AI product image analyzer for e-commerce.
Summary
- Base64 encoding enables image upload without requiring public URLs
- detail: 'low' for batch processing, detail: 'high' for detailed single-image analysis
- Platform-specific captions are generated in a single request, with the copy adapted for Instagram, Twitter, and LinkedIn
- Accessibility alt text is factual and concise — different from the SEO-optimised version
- The website automation example scales alt text generation across an entire site's images
Continue to Tool 18: Product Image Analyzer for E-commerce →
