Java Project: Building a High-Throughput Web Scraper

Java Project: Building a High-Throughput Web Scraper
Web scrapers are a practical concurrency exercise: hundreds of HTTP requests must run simultaneously, each waiting on network I/O, while the crawl frontier must be managed to avoid revisiting URLs or hammering servers. This project builds a complete scraper in modern Java (21+) using virtual threads, Java's built-in HttpClient, and Jsoup for HTML parsing. The result handles ~1,000 pages/minute while respecting server politeness rules; robots.txt compliance is the job of RobotsParser, which appears in the project structure but whose code is not listed in this article.
Project Structure
scraper/
├── src/main/java/com/scraper/
│   ├── core/
│   │   ├── Scraper.java         # Main orchestrator
│   │   ├── CrawlFrontier.java   # URL queue with deduplication
│   │   └── HttpFetcher.java     # HTTP client wrapper with retry
│   ├── parser/
│   │   ├── PageParser.java      # Jsoup-based HTML extraction
│   │   └── RobotsParser.java    # robots.txt compliance
│   ├── storage/
│   │   └── PageStore.java       # Output: file or database
│   └── Main.java
├── pom.xml
└── config/
    └── scraper.properties
Dependencies (pom.xml)
<dependencies>
  <!-- HTML parsing -->
  <dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.17.2</version>
  </dependency>
  <!-- Structured logging (brings in slf4j-api transitively) -->
  <dependency>
    <groupId>ch.qos.logback</groupId>
    <artifactId>logback-classic</artifactId>
    <version>1.5.3</version>
  </dependency>
  <!-- Retry support -->
  <dependency>
    <groupId>dev.failsafe</groupId>
    <artifactId>failsafe</artifactId>
    <version>3.3.2</version>
  </dependency>
  <!-- JSON serialization: PageStore uses com.fasterxml.jackson.databind.ObjectMapper -->
  <dependency>
    <groupId>com.fasterxml.jackson.core</groupId>
    <artifactId>jackson-databind</artifactId>
    <version>2.17.0</version>
  </dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>21</source>
<target>21</target>
</configuration>
</plugin>
</plugins>
</build>
Crawl Frontier: URL Management
// CrawlFrontier.java
package com.scraper.core;
import java.net.URI;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.LinkedBlockingQueue;
public class CrawlFrontier {
private final BlockingQueue<URI> queue = new LinkedBlockingQueue<>(100_000);
private final Set<String> visited = ConcurrentHashMap.newKeySet();
private final int maxDepth;
private final String allowedDomain;
/**
 * Creates an empty frontier.
 *
 * @param allowedDomain host that a URL passed to {@code offer} must match to be accepted
 * @param maxDepth      depth limit; {@code offer} rejects URLs whose depth is at or above it
 */
public CrawlFrontier(String allowedDomain, int maxDepth) {
    this.maxDepth = maxDepth;
    this.allowedDomain = allowedDomain;
}
/**
 * Adds a starting URL. The URL is queued only if its normalized form has not
 * been recorded before; no domain or depth check is applied here.
 *
 * @param url the URL to start crawling from
 */
public void seed(URI url) {
    boolean firstSighting = visited.add(normalize(url));
    if (!firstSighting) {
        return;
    }
    queue.offer(url);
}
/**
 * Removes and returns the URL at the head of the queue, waiting until one is
 * available if the queue is currently empty.
 *
 * @return the next URL to crawl
 * @throws InterruptedException if the calling thread is interrupted while waiting
 */
public URI next() throws InterruptedException {
return queue.take(); // blocks if empty
}
/**
 * Offers a discovered URL to the frontier.
 *
 * <p>The URL is accepted only when its depth is below {@code maxDepth}, its host
 * matches the allowed domain (compared case-insensitively, since host names are
 * not case-sensitive), it has not been seen before, and the queue has capacity.
 *
 * @param url          the discovered URL
 * @param currentDepth depth at which the URL would be crawled
 * @return {@code true} if the URL was queued, {@code false} if it was rejected
 */
public boolean offer(URI url, int currentDepth) {
    if (currentDepth >= maxDepth) {
        return false;
    }
    // getHost() is null for URIs without a server-based authority (e.g. "http:///path").
    String host = url.getHost();
    if (host == null || !host.equalsIgnoreCase(allowedDomain)) {
        return false; // stay on domain
    }
    String normalized = normalize(url);
    if (!visited.add(normalized)) {
        return false;
    }
    if (queue.offer(url)) {
        return true;
    }
    // The bounded queue is full. Forget the URL again so that a later discovery
    // can still enqueue it; otherwise it would be marked visited but never crawled.
    visited.remove(normalized);
    return false;
}
/** Returns {@code true} when no URLs are currently waiting in the queue. */
public boolean isEmpty() {
return queue.isEmpty();
}
/** Returns the number of URLs currently waiting in the queue. */
public int size() {
return queue.size();
}
/** Returns the number of normalized URLs recorded in the visited set. */
public int visitedCount() {
return visited.size();
}
/**
 * Builds the deduplication key for a URL.
 *
 * <p>The fragment is dropped, a trailing slash on the path is removed, and the
 * scheme and host are lower-cased. The path and query keep their case and are
 * part of the key, so {@code /Page?p=1} and {@code /page?p=2} stay distinct.
 * A non-default port is kept as well.
 *
 * @param url the URL to normalize
 * @return the key used in the visited set
 */
private String normalize(URI url) {
    String scheme = url.getScheme() == null ? "" : url.getScheme().toLowerCase(java.util.Locale.ROOT);
    String host = url.getHost() == null ? "" : url.getHost().toLowerCase(java.util.Locale.ROOT);

    StringBuilder key = new StringBuilder(scheme).append("://").append(host);

    int port = url.getPort();
    boolean defaultPort = (port == 80 && scheme.equals("http")) || (port == 443 && scheme.equals("https"));
    if (port != -1 && !defaultPort) {
        key.append(':').append(port);
    }

    String path = url.getRawPath() == null ? "" : url.getRawPath();
    if (path.endsWith("/")) {
        // "/docs/" and "/docs" (and "/" and "") refer to the same page for dedup purposes.
        path = path.substring(0, path.length() - 1);
    }
    key.append(path);

    String query = url.getRawQuery();
    if (query != null) {
        key.append('?').append(query);
    }
    return key.toString();
}
}
HTTP Fetcher with Retry and Politeness
// HttpFetcher.java
package com.scraper.core;
import dev.failsafe.Failsafe;
import dev.failsafe.RetryPolicy;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.time.Duration;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
public class HttpFetcher {
private static final Logger log = LoggerFactory.getLogger(HttpFetcher.class);
private final HttpClient client;
private final Map<String, Long> lastRequestTime = new ConcurrentHashMap<>();
private final long politenessDelayMs;
// Retry policy used by fetch(): a response is treated as a failure when its
// status code is 429 or >= 500. Configured with a backoff between 1 and 30
// seconds and at most 3 retries; every retry is logged at WARN with the
// attempt count and the last result.
private final RetryPolicy<HttpResponse<String>> retryPolicy = RetryPolicy
.<HttpResponse<String>>builder()
.handleResultIf(r -> r.statusCode() == 429 || r.statusCode() >= 500)
.withBackoff(Duration.ofSeconds(1), Duration.ofSeconds(30))
.withMaxRetries(3)
.onRetry(e -> log.warn("Retry {} for {}", e.getAttemptCount(), e.getLastResult()))
.build();
/**
 * Creates a fetcher with its own {@link HttpClient}: 10-second connect timeout,
 * {@code NORMAL} redirect policy.
 *
 * @param politenessDelayMs minimum gap, in milliseconds, between requests to the same host
 */
public HttpFetcher(long politenessDelayMs) {
    HttpClient.Builder builder = HttpClient.newBuilder();
    builder.connectTimeout(Duration.ofSeconds(10));
    builder.followRedirects(HttpClient.Redirect.NORMAL);
    this.client = builder.build();
    this.politenessDelayMs = politenessDelayMs;
}
/**
 * Fetches a page with a GET request, applying the per-host politeness delay
 * and the retry policy.
 *
 * <p>This method does not throw for fetch problems: any exception raised while
 * waiting, building the request or sending it is converted into an error result.
 *
 * @param url the page to fetch
 * @return a success result for HTTP 200, a failure result carrying the status
 *         code for any other response, or an error result if an exception occurred
 */
public FetchResult fetch(URI url) {
    try {
        // Inside the try so that a URI without a usable host yields an error
        // result instead of an exception escaping to the caller.
        applyPolitenessDelay(url.getHost());
        HttpRequest request = HttpRequest.newBuilder(url)
            .GET()
            .header("User-Agent", "ScraperBot/1.0 (+https://example.com/bot)")
            .header("Accept", "text/html,application/xhtml+xml")
            .timeout(Duration.ofSeconds(15))
            .build();
        HttpResponse<String> response = Failsafe
            .with(retryPolicy)
            .get(() -> client.send(request, HttpResponse.BodyHandlers.ofString()));
        if (response.statusCode() == 200) {
            return FetchResult.success(url, response.body(), detectCharset(response));
        }
        return FetchResult.failure(url, response.statusCode());
    } catch (Exception e) {
        if (e.getCause() instanceof InterruptedException) {
            // Keep the interrupt visible to the caller after it has been wrapped.
            Thread.currentThread().interrupt();
        }
        // Some exceptions carry no message; fall back to the type name so the
        // error field is never null for an error result.
        String reason = e.getMessage() != null ? e.getMessage() : e.getClass().getSimpleName();
        log.debug("Failed to fetch {}: {}", url, reason, e);
        return FetchResult.error(url, reason);
    }
}
/**
 * Blocks the calling thread until it is this request's turn for {@code host},
 * keeping at least {@code politenessDelayMs} between consecutive requests to
 * the same host.
 *
 * <p>Each caller reserves its own time slot in a single atomic map update, so
 * concurrent callers for one host are spaced out rather than all sleeping the
 * same amount and firing together.
 *
 * @param host the target host; {@code null} is ignored
 */
private void applyPolitenessDelay(String host) {
    if (host == null || politenessDelayMs <= 0) {
        return;
    }
    long now = System.currentTimeMillis();
    // First request to a host goes out immediately; later ones are scheduled one
    // delay after the previously reserved slot (or now, if that is already past).
    long slot = lastRequestTime.merge(host, now,
        (previousSlot, current) -> Math.max(current, previousSlot + politenessDelayMs));
    long waitMs = slot - now;
    if (waitMs <= 0) {
        return;
    }
    try {
        Thread.sleep(waitMs);
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
    }
}
// Matches the charset parameter of a Content-Type value, case-insensitively,
// with or without surrounding double quotes.
private static final java.util.regex.Pattern CHARSET_PARAM =
    java.util.regex.Pattern.compile("charset\\s*=\\s*\"?([^\";\\s]+)",
        java.util.regex.Pattern.CASE_INSENSITIVE);

/**
 * Extracts the charset name from the response's Content-Type header.
 *
 * @param response the HTTP response
 * @return the declared charset without quotes, or {@code "UTF-8"} when the header
 *         is missing or declares no non-empty charset
 */
private String detectCharset(HttpResponse<String> response) {
    return response.headers().firstValue("content-type")
        .map(contentType -> CHARSET_PARAM.matcher(contentType))
        .filter(matcher -> matcher.find())
        .map(matcher -> matcher.group(1))
        .orElse("UTF-8");
}
/**
 * Outcome of one fetch attempt.
 *
 * @param url        the URL that was requested
 * @param body       response body for a successful fetch, otherwise {@code null}
 * @param statusCode HTTP status code, or 0 when no response was obtained
 * @param error      description of the problem, or {@code null} on success
 * @param success    whether the fetch produced a usable page
 */
public record FetchResult(URI url, String body, int statusCode, String error, boolean success) {
    private static final int HTTP_OK = 200;
    private static final int NO_STATUS = 0;

    /** Result for an HTTP 200 response. The {@code charset} argument is accepted but not stored. */
    static FetchResult success(URI url, String body, String charset) {
        return new FetchResult(url, body, HTTP_OK, null, true);
    }

    /** Result for a response with a non-200 status; the error text is {@code "HTTP <status>"}. */
    static FetchResult failure(URI url, int status) {
        String description = "HTTP " + status;
        return new FetchResult(url, null, status, description, false);
    }

    /** Result for a fetch that ended in an exception; the status code is 0. */
    static FetchResult error(URI url, String msg) {
        return new FetchResult(url, null, NO_STATUS, msg, false);
    }
}
}
HTML Parser
// PageParser.java
package com.scraper.parser;
import com.scraper.core.HttpFetcher.FetchResult;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import java.net.URI;
import java.util.List;
import java.util.Optional;
public class PageParser {
/**
 * Parses a fetched page and extracts its title, meta description, text
 * content, outbound links and image URLs.
 *
 * @param result a fetch result whose body holds the page HTML
 * @return the extracted page data
 */
public ParsedPage parse(FetchResult result) {
    URI pageUrl = result.url();
    Document doc = Jsoup.parse(result.body(), pageUrl.toString());

    // The extractors run in this fixed order.
    String title = extractTitle(doc);
    String description = extractMetaDescription(doc);
    String text = extractTextContent(doc);
    List<URI> links = extractLinks(doc, pageUrl);
    List<String> images = extractImages(doc);

    return new ParsedPage(pageUrl, title, description, text, links, images);
}
/** Returns the document title with surrounding whitespace trimmed, or "" if the title is null. */
private String extractTitle(Document doc) {
    String title = doc.title();
    if (title == null) {
        return "";
    }
    return title.trim();
}
/** Returns the {@code content} attribute that jsoup reports for the {@code meta[name=description]} elements. */
private String extractMetaDescription(Document doc) {
    Elements descriptionTags = doc.select("meta[name=description]");
    return descriptionTags.attr("content");
}
/**
 * Extracts the readable text of the page.
 *
 * <p>Boilerplate elements are stripped from a copy of the document, so the
 * caller's {@code doc} is left untouched and extraction steps that run
 * afterwards still see links and images inside nav, footer and aside elements.
 *
 * @param doc the parsed page; not modified
 * @return text of the main-content elements if any match and are non-empty,
 *         otherwise the text of the stripped body
 */
private String extractTextContent(Document doc) {
    Document working = doc.clone();
    // Remove navigation, footer, ads. Note that [class*=ad] is a substring
    // match, so it also hits class names that merely contain "ad".
    working.select("nav, footer, aside, script, style, [class*=ad], [id*=banner]").remove();
    // Get main content
    String main = working.select("main, article, [role=main], .content, #content").text();
    if (!main.isEmpty()) {
        return main;
    }
    return working.body().text();
}
// File extensions that are not HTML pages; compiled once and matched case-insensitively.
private static final java.util.regex.Pattern NON_PAGE_EXTENSION =
    java.util.regex.Pattern.compile("\\.(pdf|jpg|png|gif|zip|exe|docx?)$",
        java.util.regex.Pattern.CASE_INSENSITIVE);

/**
 * Collects up to 50 distinct outbound http(s) links from the page.
 *
 * <p>Fragments are stripped rather than used as a reason to drop the link, so
 * {@code /guide#intro} still contributes {@code /guide}. Links that point back
 * at {@code baseUrl} after stripping (same-page anchors) are skipped.
 *
 * @param doc     the parsed page
 * @param baseUrl URL of the page itself
 * @return an unmodifiable list of link targets
 */
private List<URI> extractLinks(Document doc, URI baseUrl) {
    Elements links = doc.select("a[href]");
    return links.stream()
        .map(el -> el.attr("abs:href")) // Jsoup resolves relative URLs
        .filter(href -> href.startsWith("http"))
        .map(href -> {
            int hash = href.indexOf('#');
            return hash < 0 ? href : href.substring(0, hash);
        })
        .filter(href -> !NON_PAGE_EXTENSION.matcher(href).find())
        .map(href -> {
            try {
                return URI.create(href);
            } catch (IllegalArgumentException e) {
                return null; // not a syntactically valid URI; skip it
            }
        })
        .filter(uri -> uri != null)
        .filter(uri -> !uri.equals(baseUrl))
        .distinct()
        .limit(50) // cap links per page to prevent exponential crawl
        .toList();
}
/**
 * Collects the absolute URLs of at most 10 images on the page.
 *
 * @param doc the parsed page
 * @return an unmodifiable list of image URLs that start with "http"
 */
private List<String> extractImages(Document doc) {
    Elements imageTags = doc.select("img[src]");
    var absoluteSources = imageTags.stream().map(tag -> tag.attr("abs:src"));
    var httpSources = absoluteSources.filter(source -> source.startsWith("http"));
    return httpSources.limit(10).toList();
}
/**
 * Data extracted from one page by {@code parse}.
 *
 * @param url             URL of the fetched page
 * @param title           page title
 * @param metaDescription content of the description meta tag
 * @param textContent     extracted text of the page
 * @param outboundLinks   link targets found in the page's anchors
 * @param imageUrls       image source URLs found in the page
 */
public record ParsedPage(
URI url,
String title,
String metaDescription,
String textContent,
List<URI> outboundLinks,
List<String> imageUrls
) {}
}
Main Scraper with Virtual Threads
// Scraper.java
package com.scraper.core;
import com.scraper.parser.PageParser;
import com.scraper.parser.PageParser.ParsedPage;
import com.scraper.storage.PageStore;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.net.URI;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicInteger;
public class Scraper {
private static final Logger log = LoggerFactory.getLogger(Scraper.class);
private final CrawlFrontier frontier;
private final HttpFetcher fetcher;
private final PageParser parser;
private final PageStore store;
private final int concurrency;
private final AtomicInteger processed = new AtomicInteger();
private final AtomicInteger errors = new AtomicInteger();
/**
 * Wires up the frontier, fetcher, parser and page store for one crawl.
 * Pages are stored under {@code output/}.
 *
 * @param seedDomain   host the crawl is restricted to
 * @param maxDepth     depth limit passed to the frontier
 * @param concurrency  maximum number of pages crawled at the same time; must be at least 1
 * @param politenessMs minimum gap in milliseconds between requests to one host
 * @throws IllegalArgumentException if {@code concurrency} is less than 1
 */
public Scraper(String seedDomain, int maxDepth, int concurrency, long politenessMs) {
    if (concurrency < 1) {
        // run() sizes a semaphore with this value; with no permits it would block forever.
        throw new IllegalArgumentException("concurrency must be at least 1, got " + concurrency);
    }
    this.frontier = new CrawlFrontier(seedDomain, maxDepth);
    this.fetcher = new HttpFetcher(politenessMs);
    this.parser = new PageParser();
    this.store = new PageStore("output/");
    this.concurrency = concurrency;
}
/**
 * Crawls breadth-first from {@code seedUrl} until the frontier is exhausted.
 *
 * <p>The crawl proceeds one depth level at a time. Everything in the queue when
 * a level starts belongs to that level, because links found while crawling it
 * are appended behind. Once those URLs are submitted, the method waits for all
 * in-flight pages to finish before starting the next level. This lets each page
 * be crawled with its real depth, so the frontier's depth limit takes effect.
 * Within a level, up to {@code concurrency} pages are crawled in parallel, each
 * on its own virtual thread.
 *
 * @param seedUrl absolute URL to start from
 * @throws InterruptedException if the calling thread is interrupted while waiting
 */
public void run(String seedUrl) throws InterruptedException {
    frontier.seed(URI.create(seedUrl));
    long startTime = System.currentTimeMillis();
    int lastReportedHundred = 0;
    // Virtual threads: one per concurrent crawl task (Java 21+), no pool sizing needed.
    try (var executor = Executors.newVirtualThreadPerTaskExecutor()) {
        var semaphore = new Semaphore(concurrency);
        for (int depth = 0; !frontier.isEmpty(); depth++) {
            final int levelDepth = depth;
            // This thread is the only consumer, so exactly this many URLs can be
            // taken without blocking, and all of them belong to the current level.
            int levelSize = frontier.size();
            for (int i = 0; i < levelSize; i++) {
                URI url = frontier.next();
                semaphore.acquire();
                executor.submit(() -> {
                    try {
                        crawlPage(url, levelDepth);
                    } catch (RuntimeException e) {
                        // Without this the failure would vanish into the discarded Future.
                        errors.incrementAndGet();
                        log.warn("Unexpected failure while crawling {}", url, e);
                    } finally {
                        semaphore.release();
                    }
                });
                // Progress reporting: once each time the processed count passes a multiple of 100.
                int count = processed.get();
                if (count / 100 > lastReportedHundred) {
                    lastReportedHundred = count / 100;
                    double elapsed = (System.currentTimeMillis() - startTime) / 1000.0;
                    log.info("Progress: {} pages, {} errors, {}/s rate, {} queued",
                        count, errors.get(), String.format("%.1f", count / elapsed), frontier.size());
                }
            }
            // Level barrier: taking every permit succeeds only when no page is in
            // flight, i.e. all links of the next level have been offered.
            semaphore.acquire(concurrency);
            semaphore.release(concurrency);
        }
    }
    log.info("Crawl complete: {} pages processed, {} errors", processed.get(), errors.get());
}
/**
 * Fetches, parses and stores one page, then offers its outbound links to the
 * frontier one level deeper. A failed fetch only increments the error counter.
 *
 * @param url   page to crawl
 * @param depth depth of this page; links are offered at {@code depth + 1}
 */
private void crawlPage(URI url, int depth) {
    var fetched = fetcher.fetch(url);
    if (fetched.success()) {
        ParsedPage page = parser.parse(fetched);
        store.save(page);
        processed.incrementAndGet();
        // Enqueue discovered links
        int childDepth = depth + 1;
        page.outboundLinks().forEach(link -> frontier.offer(link, childDepth));
        log.debug("Crawled: {} ({})", url, page.title());
    } else {
        errors.incrementAndGet();
    }
}
}
Page Storage
// PageStore.java
package com.scraper.storage;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.scraper.parser.PageParser.ParsedPage;
import java.io.IOException;
import java.nio.file.*;
import java.security.MessageDigest;
public class PageStore {
private final Path outputDir;
private final ObjectMapper mapper = new ObjectMapper();
/**
 * Creates a store that writes into {@code outputDir}, creating the directory
 * (and any missing parents) if necessary.
 *
 * @param outputDir path of the output directory
 * @throws RuntimeException if the directory cannot be created
 */
public PageStore(String outputDir) {
    Path target = Path.of(outputDir);
    try {
        Files.createDirectories(target);
    } catch (IOException e) {
        throw new RuntimeException("Cannot create output directory", e);
    }
    this.outputDir = target;
}
private static final org.slf4j.Logger log = org.slf4j.LoggerFactory.getLogger(PageStore.class);

/**
 * Writes the page as JSON to a file in the output directory. The file name is
 * derived from the page URL.
 *
 * <p>Saving is best-effort: an I/O failure is logged and the crawl continues.
 *
 * @param page the page to persist
 */
public void save(ParsedPage page) {
    Path file = outputDir.resolve(urlToFilename(page.url().toString()) + ".json");
    try {
        mapper.writeValue(file.toFile(), page);
    } catch (IOException e) {
        // Non-critical: log and continue
        log.warn("Failed to save {} to {}", page.url(), file, e);
    }
}
/**
 * Derives a file name from a URL: the first 16 lowercase hex characters of the
 * SHA-256 digest of the URL's UTF-8 bytes.
 *
 * <p>The charset is fixed so that the same URL maps to the same file name
 * regardless of the JVM's default encoding.
 *
 * @param url the page URL
 * @return the hex prefix, or the decimal {@code String.hashCode()} of the URL
 *         if SHA-256 is unavailable
 */
private String urlToFilename(String url) {
    try {
        byte[] digest = MessageDigest.getInstance("SHA-256")
            .digest(url.getBytes(java.nio.charset.StandardCharsets.UTF_8));
        // 8 bytes -> 16 hex characters
        return java.util.HexFormat.of().formatHex(digest, 0, 8);
    } catch (java.security.NoSuchAlgorithmException e) {
        return String.valueOf(url.hashCode());
    }
}
}
Running the Scraper
// Main.java
public class Main {
/**
 * Entry point. The first argument, if present, is the seed URL; otherwise
 * {@code https://example.com} is used. The crawl is restricted to the seed's host.
 *
 * @param args optional seed URL
 * @throws InterruptedException if the crawl is interrupted
 */
public static void main(String[] args) throws InterruptedException {
    String seedUrl = args.length > 0 ? args[0] : "https://example.com";
    // Types are fully qualified because this listing has no import section.
    String seedHost = java.net.URI.create(seedUrl).getHost();
    if (seedHost == null) {
        // e.g. "example.com" without a scheme: no host, so no link could ever match the domain.
        System.err.println("Seed URL must be absolute and include a host (e.g. https://example.com): " + seedUrl);
        System.exit(2);
    }
    int maxDepth = 3;
    int concurrency = 50; // 50 concurrent requests
    long politeness = 500; // 500ms between requests to same host
    com.scraper.core.Scraper scraper = new com.scraper.core.Scraper(
        seedHost,
        maxDepth, concurrency, politeness
    );
    scraper.run(seedUrl);
}
}
mvn package -q
java -Xmx512m -jar target/scraper.jar https://example.com
# Expected output:
# Progress: 100 pages, 3 errors, 42.3/s rate, 847 queued
# Progress: 200 pages, 5 errors, 38.7/s rate, 1203 queued
# Crawl complete: 847 pages processed, 12 errors
Frequently Asked Questions
Q: Why use virtual threads instead of a fixed thread pool for the scraper?
Virtual threads (Java 21+) are lightweight—you can have tens of thousands waiting on I/O without exhausting heap or OS thread limits. A fixed thread pool of 50 threads can handle 50 concurrent requests, but each thread is a full OS thread consuming ~1MB of stack. With virtual threads, you can set concurrency = 500 with the same memory budget, dramatically increasing throughput on I/O-bound workloads like web scraping. Virtual threads park (cheaply suspend) while waiting for HTTP responses, freeing the carrier thread for other work.
Q: How do I handle JavaScript-rendered pages?
Java's HttpClient and Jsoup only parse static HTML. For JavaScript-rendered content, you need a headless browser. Options: (1) Playwright for Java (com.microsoft.playwright:playwright) can launch Chromium headlessly and return rendered HTML; (2) Selenium WebDriver with a headless Chrome instance; (3) Try the mobile or AMP version of the page, which is often static HTML. Headless browser scraping is 5-20x slower than plain HTTP scraping—use it selectively for specific targets, not as the default fetcher.
Q: What is the right politeness delay to use?
Check robots.txt for a Crawl-delay directive first—if present, respect it. Without a directive, 500ms-1000ms between requests to the same host is a reasonable default for production scrapers. More aggressive (100ms) may be acceptable for internal systems you control. Back off exponentially on 429 (Too Many Requests) responses. Never ignore 429—it means your scraper is causing measurable load on the target server and continued hammering is both impolite and counterproductive (the server may ban your IP).
Q: How do I scale this beyond a single JVM?
Replace the in-memory CrawlFrontier with a distributed queue (Redis with SETNX for deduplication, or Apache Kafka with a "visited" topic). Each scraper instance pulls URLs from the shared queue and publishes discovered links back. Use a centralized visited set in Redis (SADD is atomic, making it safe for concurrent workers). Store results in a shared database (PostgreSQL, Elasticsearch) instead of local files. With this architecture, you can run 10-100 scraper JVMs in parallel, each handling different URL batches.
