Java Project: Building a High-Throughput Web Scraper
Web scrapers are a practical concurrency exercise: hundreds of HTTP requests must run simultaneously, each waiting on network I/O, while the crawl frontier must be managed to avoid revisiting URLs or hammering servers. This project builds a complete scraper in modern Java (21+) using virtual threads, Java's built-in HttpClient, and Jsoup for HTML parsing. The result can sustain on the order of 1,000 pages per minute on I/O-bound crawls while respecting robots.txt and server politeness rules; on a single host, it is the politeness delay, not the thread count, that ultimately caps throughput.
Project Structure
scraper/
├── src/main/java/com/scraper/
│   ├── core/
│   │   ├── Scraper.java          # Main orchestrator
│   │   ├── CrawlFrontier.java    # URL queue with deduplication
│   │   └── HttpFetcher.java      # HTTP client wrapper with retry
│   ├── parser/
│   │   ├── PageParser.java       # Jsoup-based HTML extraction
│   │   └── RobotsParser.java     # robots.txt compliance
│   ├── storage/
│   │   └── PageStore.java        # Output: file or database
│   └── Main.java
├── pom.xml
└── config/
    └── scraper.properties
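The tree lists config/scraper.properties. As written, Main.java hard-codes its settings; a properties file along these lines (keys are illustrative and not read by the code in this article) could drive them instead:
# config/scraper.properties (illustrative keys, not wired up in Main.java)
seed.url=https://example.com
crawl.max-depth=3
crawl.concurrency=50
politeness.delay-ms=500
output.dir=output/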
Dependencies (pom.xml)
<dependencies>
<!-- HTML parsing -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.17.2</version>
</dependency>
<!-- Structured logging -->
<dependency>
<groupId>ch.qos.logback</groupId>
<artifactId>logback-classic</artifactId>
<version>1.5.3</version>
</dependency>
<!-- Retry support -->
<dependency>
<groupId>dev.failsafe</groupId>
<artifactId>failsafe</artifactId>
<version>3.3.2</version>
</dependency>
    <!-- JSON output (used by PageStore) -->
    <dependency>
        <groupId>com.fasterxml.jackson.core</groupId>
        <artifactId>jackson-databind</artifactId>
        <version>2.17.2</version>
    </dependency>
</dependencies>
<build>
    <plugins>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-compiler-plugin</artifactId>
            <configuration>
                <release>21</release>
            </configuration>
        </plugin>
        <!-- Bundle a runnable jar so `java -jar target/scraper.jar` works -->
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-shade-plugin</artifactId>
            <version>3.5.2</version>
            <executions>
                <execution>
                    <phase>package</phase>
                    <goals><goal>shade</goal></goals>
                    <configuration>
                        <transformers>
                            <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                <mainClass>com.scraper.Main</mainClass>
                            </transformer>
                        </transformers>
                    </configuration>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>
Crawl Frontier: URL Management
// CrawlFrontier.java
package com.scraper.core;
import java.net.URI;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.LinkedBlockingQueue;
public class CrawlFrontier {
    // Queue entries carry their depth so the max-depth limit is enforceable
    public record CrawlTask(URI url, int depth) {}
    private final BlockingQueue<CrawlTask> queue = new LinkedBlockingQueue<>(100_000);
    private final Set<String> visited = ConcurrentHashMap.newKeySet();
    private final int maxDepth;
    private final String allowedDomain;
    public CrawlFrontier(String allowedDomain, int maxDepth) {
        this.allowedDomain = allowedDomain;
        this.maxDepth = maxDepth;
    }
    public void seed(URI url) {
        if (visited.add(normalize(url))) {
            queue.offer(new CrawlTask(url, 0));
        }
    }
    public CrawlTask next() throws InterruptedException {
        return queue.take(); // blocks if empty
    }
    public boolean offer(URI url, int depth) {
        if (depth >= maxDepth) return false;
        String host = url.getHost();
        if (host == null || !host.equalsIgnoreCase(allowedDomain)) return false; // stay on domain
        String normalized = normalize(url);
        if (visited.add(normalized)) {
            return queue.offer(new CrawlTask(url, depth));
        }
        return false;
    }
    public boolean isEmpty() {
        return queue.isEmpty();
    }
    public int size() {
        return queue.size();
    }
    public int visitedCount() {
        return visited.size();
    }
    private String normalize(URI url) {
        // Drop the fragment and query string, strip the trailing slash, lowercase.
        // Note: dropping the query means ?page=1 and ?page=2 dedupe to one URL.
        String normalized = url.getScheme() + "://" + url.getHost() + url.getPath();
        if (normalized.endsWith("/") && normalized.length() > 1) {
            normalized = normalized.substring(0, normalized.length() - 1);
        }
        return normalized.toLowerCase();
    }
}
HTTP Fetcher with Retry and Politeness
// HttpFetcher.java
package com.scraper.core;
import dev.failsafe.Failsafe;
import dev.failsafe.RetryPolicy;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.time.Duration;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.locks.ReentrantLock;
public class HttpFetcher {
private static final Logger log = LoggerFactory.getLogger(HttpFetcher.class);
private final HttpClient client;
private final Map<String, Long> lastRequestTime = new ConcurrentHashMap<>();
private final long politenessDelayMs;
private final RetryPolicy<HttpResponse<String>> retryPolicy = RetryPolicy
.<HttpResponse<String>>builder()
.handleResultIf(r -> r.statusCode() == 429 || r.statusCode() >= 500)
.withBackoff(Duration.ofSeconds(1), Duration.ofSeconds(30))
.withMaxRetries(3)
        .onRetry(e -> log.warn("Retry attempt {} (last status: {})", e.getAttemptCount(),
            e.getLastResult() != null ? e.getLastResult().statusCode() : e.getLastException()))
.build();
public HttpFetcher(long politenessDelayMs) {
this.politenessDelayMs = politenessDelayMs;
this.client = HttpClient.newBuilder()
.connectTimeout(Duration.ofSeconds(10))
.followRedirects(HttpClient.Redirect.NORMAL)
.build();
}
public FetchResult fetch(URI url) {
applyPolitenessDelay(url.getHost());
try {
HttpRequest request = HttpRequest.newBuilder(url)
.GET()
.header("User-Agent", "ScraperBot/1.0 (+https://example.com/bot)")
.header("Accept", "text/html,application/xhtml+xml")
.timeout(Duration.ofSeconds(15))
.build();
HttpResponse<String> response = Failsafe
.with(retryPolicy)
.get(() -> client.send(request, HttpResponse.BodyHandlers.ofString()));
if (response.statusCode() == 200) {
return FetchResult.success(url, response.body(), detectCharset(response));
} else {
return FetchResult.failure(url, response.statusCode());
}
} catch (Exception e) {
log.debug("Failed to fetch {}: {}", url, e.getMessage());
return FetchResult.error(url, e.getMessage());
}
}
    // One lock per host: concurrent virtual threads serialize here so the
    // delay check-and-update is atomic. ReentrantLock, unlike synchronized,
    // does not pin the carrier thread while a virtual thread waits.
    private final Map<String, ReentrantLock> hostLocks = new ConcurrentHashMap<>();
    private void applyPolitenessDelay(String host) {
        ReentrantLock lock = hostLocks.computeIfAbsent(host, h -> new ReentrantLock());
        lock.lock();
        try {
            Long last = lastRequestTime.get(host);
            if (last != null) {
                long elapsed = System.currentTimeMillis() - last;
                if (elapsed < politenessDelayMs) {
                    try {
                        Thread.sleep(politenessDelayMs - elapsed);
                    } catch (InterruptedException e) {
                        Thread.currentThread().interrupt();
                    }
                }
            }
            lastRequestTime.put(host, System.currentTimeMillis());
        } finally {
            lock.unlock();
        }
    }
    private String detectCharset(HttpResponse<String> response) {
        // BodyHandlers.ofString() has already decoded the body using this
        // charset; we carry it in the FetchResult as metadata.
        return response.headers().firstValue("content-type")
            .filter(ct -> ct.contains("charset="))
            .map(ct -> ct.substring(ct.indexOf("charset=") + 8).split(";")[0].trim())
            .orElse("UTF-8");
    }
    public record FetchResult(URI url, String body, String charset, int statusCode, String error, boolean success) {
        static FetchResult success(URI url, String body, String charset) {
            return new FetchResult(url, body, charset, 200, null, true);
        }
        static FetchResult failure(URI url, int status) {
            return new FetchResult(url, null, null, status, "HTTP " + status, false);
        }
        static FetchResult error(URI url, String msg) {
            return new FetchResult(url, null, null, 0, msg, false);
        }
    }
}
HTML Parser
// PageParser.java
package com.scraper.parser;
import com.scraper.core.HttpFetcher.FetchResult;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import java.net.URI;
import java.util.List;
import java.util.Optional;
public class PageParser {
public ParsedPage parse(FetchResult result) {
Document doc = Jsoup.parse(result.body(), result.url().toString());
return new ParsedPage(
result.url(),
extractTitle(doc),
extractMetaDescription(doc),
extractTextContent(doc),
extractLinks(doc, result.url()),
extractImages(doc)
);
}
private String extractTitle(Document doc) {
return Optional.ofNullable(doc.title()).orElse("").trim();
}
private String extractMetaDescription(Document doc) {
return doc.select("meta[name=description]").attr("content");
}
private String extractTextContent(Document doc) {
// Remove navigation, footer, ads
doc.select("nav, footer, aside, script, style, [class*=ad], [id*=banner]").remove();
// Get main content
String main = doc.select("main, article, [role=main], .content, #content").text();
return main.isEmpty() ? doc.body().text() : main;
}
private List<URI> extractLinks(Document doc, URI baseUrl) {
Elements links = doc.select("a[href]");
return links.stream()
.map(el -> el.attr("abs:href")) // Jsoup resolves relative URLs
.filter(href -> href.startsWith("http"))
.filter(href -> !href.contains("#")) // skip fragment links
            .filter(href -> !href.matches("(?i).*\\.(pdf|jpg|png|gif|zip|exe|docx?)$"))
.map(href -> {
try { return URI.create(href); } catch (Exception e) { return null; }
})
.filter(uri -> uri != null)
.distinct()
.limit(50) // cap links per page to prevent exponential crawl
.toList();
}
private List<String> extractImages(Document doc) {
return doc.select("img[src]").stream()
.map(img -> img.attr("abs:src"))
.filter(src -> src.startsWith("http"))
.limit(10)
.toList();
}
public record ParsedPage(
URI url,
String title,
String metaDescription,
String textContent,
List<URI> outboundLinks,
List<String> imageUrls
) {}
}
Main Scraper with Virtual Threads
// Scraper.java
package com.scraper.core;
import com.scraper.parser.PageParser;
import com.scraper.parser.PageParser.ParsedPage;
import com.scraper.storage.PageStore;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.net.URI;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicInteger;
public class Scraper {
private static final Logger log = LoggerFactory.getLogger(Scraper.class);
private final CrawlFrontier frontier;
private final HttpFetcher fetcher;
private final PageParser parser;
private final PageStore store;
private final int concurrency;
private final AtomicInteger processed = new AtomicInteger();
private final AtomicInteger errors = new AtomicInteger();
public Scraper(String seedDomain, int maxDepth, int concurrency, long politenessMs) {
this.frontier = new CrawlFrontier(seedDomain, maxDepth);
this.fetcher = new HttpFetcher(politenessMs);
this.parser = new PageParser();
this.store = new PageStore("output/");
this.concurrency = concurrency;
}
public void run(String seedUrl) throws InterruptedException {
frontier.seed(URI.create(seedUrl));
// Virtual threads: one per concurrent crawl task
// Java 21+ — no thread pool sizing needed
try (var executor = Executors.newVirtualThreadPerTaskExecutor()) {
var semaphore = new Semaphore(concurrency);
long startTime = System.currentTimeMillis();
            // Loop until the frontier is drained and no fetches are in flight
            while (!frontier.isEmpty() || semaphore.availablePermits() < concurrency) {
if (frontier.isEmpty()) {
Thread.sleep(100); // wait for in-flight requests to add URLs
continue;
}
                var task = frontier.next();
semaphore.acquire();
executor.submit(() -> {
try {
                        crawlPage(task.url(), task.depth());
} finally {
semaphore.release();
}
});
// Progress reporting every 100 pages
int count = processed.get();
if (count > 0 && count % 100 == 0) {
double elapsed = (System.currentTimeMillis() - startTime) / 1000.0;
log.info("Progress: {} pages, {} errors, {}/s rate, {} queued",
count, errors.get(), String.format("%.1f", count / elapsed), frontier.size());
}
}
}
log.info("Crawl complete: {} pages processed, {} errors", processed.get(), errors.get());
}
private void crawlPage(URI url, int depth) {
var result = fetcher.fetch(url);
if (!result.success()) {
errors.incrementAndGet();
return;
}
ParsedPage page = parser.parse(result);
store.save(page);
processed.incrementAndGet();
// Enqueue discovered links
for (URI link : page.outboundLinks()) {
frontier.offer(link, depth + 1);
}
log.debug("Crawled: {} ({})", url, page.title());
}
}
Page Storage
// PageStore.java
package com.scraper.storage;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.scraper.parser.PageParser.ParsedPage;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.*;
import java.security.MessageDigest;
public class PageStore {
    private static final Logger log = LoggerFactory.getLogger(PageStore.class);
    private final Path outputDir;
    private final ObjectMapper mapper = new ObjectMapper();
    public PageStore(String outputDir) {
        this.outputDir = Path.of(outputDir);
        try {
            Files.createDirectories(this.outputDir);
        } catch (IOException e) {
            throw new RuntimeException("Cannot create output directory", e);
        }
    }
    public void save(ParsedPage page) {
        try {
            String filename = urlToFilename(page.url().toString()) + ".json";
            Path file = outputDir.resolve(filename);
            mapper.writeValue(file.toFile(), page);
        } catch (IOException e) {
            // Non-critical: log the failure and keep crawling
            log.warn("Failed to save {}: {}", page.url(), e.getMessage());
        }
    }
    private String urlToFilename(String url) {
        try {
            byte[] digest = MessageDigest.getInstance("SHA-256")
                .digest(url.getBytes(StandardCharsets.UTF_8));
            StringBuilder sb = new StringBuilder();
            for (byte b : digest) sb.append(String.format("%02x", b));
            return sb.substring(0, 16); // first 16 hex chars (64 bits) of the digest
        } catch (Exception e) {
            return String.valueOf(url.hashCode());
        }
    }
}
Running the Scraper
// Main.java
package com.scraper;
import com.scraper.core.Scraper;
import java.net.URI;
public class Main {
    public static void main(String[] args) throws InterruptedException {
        String seedUrl = args.length > 0 ? args[0] : "https://example.com";
        int maxDepth = 3;
        int concurrency = 50;   // up to 50 concurrent requests
        long politeness = 500;  // 500ms between requests to the same host
        Scraper scraper = new Scraper(
            URI.create(seedUrl).getHost(),
            maxDepth, concurrency, politeness
        );
        scraper.run(seedUrl);
    }
}
mvn package -q
java -Xmx512m -jar target/scraper.jar https://example.com
# Example output (rates vary with target latency and politeness settings):
# Progress: 100 pages, 3 errors, 42.3/s rate, 847 queued
# Progress: 200 pages, 5 errors, 38.7/s rate, 1203 queued
# Crawl complete: 847 pages processed, 12 errors
Frequently Asked Questions
Q: Why use virtual threads instead of a fixed thread pool for the scraper?
Virtual threads (Java 21+) are lightweight—you can have tens of thousands waiting on I/O without exhausting heap or OS thread limits. A fixed thread pool of 50 threads can handle 50 concurrent requests, but each thread is a full OS thread consuming ~1MB of stack. With virtual threads, you can set concurrency = 500 with the same memory budget, dramatically increasing throughput on I/O-bound workloads like web scraping. Virtual threads park (cheaply suspend) while waiting for HTTP responses, freeing the carrier thread for other work.
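A minimal, self-contained demo of the difference (the class name and task count are illustrative, not part of the project): 10,000 sleeping tasks submitted to a virtual-thread executor all park concurrently, where a 50-thread fixed pool would process them in waves of 50.
// VirtualThreadDemo.java (illustrative, not part of the scraper)
import java.time.Duration;
import java.util.concurrent.Executors;
public class VirtualThreadDemo {
    public static void main(String[] args) {
        // Swap in Executors.newFixedThreadPool(50) and the same work takes
        // roughly 200x longer: 10,000 tasks / 50 threads at 1s each.
        try (var executor = Executors.newVirtualThreadPerTaskExecutor()) {
            for (int i = 0; i < 10_000; i++) {
                executor.submit(() -> {
                    Thread.sleep(Duration.ofSeconds(1)); // stands in for network I/O
                    return null; // Callable, so the checked exception is permitted
                });
            }
        } // close() waits for all submitted tasks to finish
    }
}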
Q: How do I handle JavaScript-rendered pages?
Java's HttpClient and Jsoup only parse static HTML. For JavaScript-rendered content, you need a headless browser. Options: (1) Playwright for Java (com.microsoft.playwright:playwright) can launch Chromium headlessly and return rendered HTML; (2) Selenium WebDriver with a headless Chrome instance; (3) Try the mobile or AMP version of the page, which is often static HTML. Headless browser scraping is 5–20× slower than plain HTTP scraping—use it selectively for specific targets, not as the default fetcher.
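A sketch of option (1), assuming the com.microsoft.playwright:playwright dependency is on the classpath and using a placeholder URL; the rendered HTML drops straight into the existing Jsoup pipeline:
// RenderedFetch.java (sketch; Playwright downloads browser binaries on first run)
import com.microsoft.playwright.Browser;
import com.microsoft.playwright.Page;
import com.microsoft.playwright.Playwright;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
public class RenderedFetch {
    public static void main(String[] args) {
        try (Playwright playwright = Playwright.create()) {
            Browser browser = playwright.chromium().launch(); // headless by default
            Page page = browser.newPage();
            page.navigate("https://example.com"); // placeholder URL
            String html = page.content();         // DOM after JavaScript ran
            Document doc = Jsoup.parse(html, "https://example.com");
            System.out.println(doc.title());
        }
    }
}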
Q: What is the right politeness delay to use?
Check robots.txt for a Crawl-delay directive first—if present, respect it. Without a directive, 500ms–1000ms between requests to the same host is a reasonable default for production scrapers. More aggressive (100ms) may be acceptable for internal systems you control. Back off exponentially on 429 (Too Many Requests) responses. Never ignore 429—it means your scraper is causing measurable load on the target server and continued hammering is both impolite and counterproductive (the server may ban your IP).
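The RobotsParser listed in the project tree would own that Crawl-delay check. A sketch of what the lookup might look like (the method name and agent token are assumptions, not a standard API):
// Possible helper inside RobotsParser.java (names are illustrative)
static long crawlDelayMs(String robotsTxt, long defaultMs) {
    boolean appliesToUs = false;
    for (String raw : robotsTxt.split("\\R")) {
        String line = raw.strip().toLowerCase();
        if (line.startsWith("user-agent:")) {
            String agent = line.substring("user-agent:".length()).strip();
            appliesToUs = agent.equals("*") || agent.contains("scraperbot");
        } else if (appliesToUs && line.startsWith("crawl-delay:")) {
            try {
                double seconds = Double.parseDouble(line.substring("crawl-delay:".length()).strip());
                return (long) (seconds * 1000); // the directive wins over our default
            } catch (NumberFormatException ignored) { /* malformed value: keep scanning */ }
        }
    }
    return defaultMs; // no directive found: use the configured politeness delay
}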
Q: How do I scale this beyond a single JVM?
Replace the in-memory CrawlFrontier with a distributed queue (Redis with SETNX for deduplication, or Apache Kafka with a "visited" topic). Each scraper instance pulls URLs from the shared queue and publishes discovered links back. Use a centralized visited set in Redis (SADD is atomic, making it safe for concurrent workers). Store results in a shared database (PostgreSQL, Elasticsearch) instead of local files. With this architecture, you can run 10–100 scraper JVMs in parallel, each handling different URL batches.
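A sketch of the Redis variant, assuming the redis.clients:jedis client; the key names "visited" and "frontier" are illustrative:
// RedisFrontier.java (sketch, assumes a Redis instance on localhost:6379)
import redis.clients.jedis.JedisPooled;
public class RedisFrontier {
    private final JedisPooled redis = new JedisPooled("localhost", 6379);
    // SADD returns 1 only for the first writer, so exactly one worker
    // enqueues a given URL: dedup and enqueue stay race-free across JVMs.
    public void offer(String url) {
        if (redis.sadd("visited", url) == 1) {
            redis.lpush("frontier", url);
        }
    }
    // BRPOP blocks until a URL is available; safe for many competing workers.
    public String next() {
        var reply = redis.brpop(0, "frontier"); // returns [key, value]
        return reply.get(1);
    }
}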
