Quick verdict: C web scraping pairs libcurl for HTTP and libxml2 for HTML parsing — both are battle-tested, ship in every Linux distro, and have stable ABIs going back 20 years. You write 10x more code than Python, but you get 5–10x throughput, 50–100x lower memory per request, and the ability to embed scraping in firmware, edge devices, or hot paths inside other C/C++ programs. In 2026, choose C when you're parsing millions of pages, running on embedded hardware, or already have a C/C++ codebase you need to extend.
| Scenario | Use C |
|---|---|
| 1M+ pages/day on a single box | Yes — CPU + memory savings compound |
| Embedded / IoT / OpenWrt | Yes — Python is too heavy |
| Hot path inside existing C/C++ app | Yes |
| Quick prototype, <10k pages | No — use Python |
| Heavy JS rendering required | No — you need Playwright |
| Team is Python-only | No — maintenance cost dominates |
# Ubuntu / Debian
sudo apt install libcurl4-openssl-dev libxml2-dev pkg-config build-essential
# macOS (Homebrew)
brew install curl libxml2 pkg-config
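# Homebrew's curl and libxml2 are keg-only; if pkg-config can't find them,
# point PKG_CONFIG_PATH at their pkgconfig dirs first:
export PKG_CONFIG_PATH="$(brew --prefix curl)/lib/pkgconfig:$(brew --prefix libxml2)/lib/pkgconfig"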
# Build flags (pkg-config gives correct -I / -L for your distro)
pkg-config --cflags --libs libcurl libxml-2.0
# typical: -I/usr/include/libxml2 -lcurl -lxml2
// scrape.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <curl/curl.h>
typedef struct {
char *data;
size_t len;
} buffer_t;
static size_t write_cb(void *ptr, size_t size, size_t nmemb, void *userdata) {
size_t total = size * nmemb;
buffer_t *buf = (buffer_t *)userdata;
char *p = realloc(buf->data, buf->len + total + 1);
if (!p) return 0;
buf->data = p;
memcpy(buf->data + buf->len, ptr, total);
buf->len += total;
buf->data[buf->len] = 0;
return total;
}
int main(int argc, char **argv) {
if (argc < 2) { fprintf(stderr, "usage: %s URL\n", argv[0]); return 1; }
curl_global_init(CURL_GLOBAL_DEFAULT);
CURL *curl = curl_easy_init();
if (!curl) { curl_global_cleanup(); return 1; }
buffer_t buf = {0};
curl_easy_setopt(curl, CURLOPT_URL, argv[1]);
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_cb);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &buf);
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
curl_easy_setopt(curl, CURLOPT_USERAGENT, "Mozilla/5.0 (C scraper/1.0)");
curl_easy_setopt(curl, CURLOPT_TIMEOUT, 20L);
CURLcode rc = curl_easy_perform(curl);
long status = 0;
curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &status);
printf("HTTP %ld, %zu bytes\n", status, buf.len);
if (rc == CURLE_OK) fwrite(buf.data, 1, buf.len > 200 ? 200 : buf.len, stdout);
free(buf.data);
curl_easy_cleanup(curl);
curl_global_cleanup();
return rc == CURLE_OK ? 0 : 1;
}
gcc scrape.c -o scrape $(pkg-config --cflags --libs libcurl)
./scrape https://example.com
struct curl_slist *headers = NULL;
headers = curl_slist_append(headers, "Accept: text/html");
headers = curl_slist_append(headers, "Accept-Language: en-US,en;q=0.9");
headers = curl_slist_append(headers, "Referer: https://www.google.com/");
curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
// after curl_easy_perform:
curl_slist_free_all(headers);
const char *body = "username=alice&password=secret";
curl_easy_setopt(curl, CURLOPT_POSTFIELDS, body);
curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE, (long)strlen(body));
// For JSON:
struct curl_slist *json_headers = NULL;
json_headers = curl_slist_append(json_headers, "Content-Type: application/json");
curl_easy_setopt(curl, CURLOPT_HTTPHEADER, json_headers);
curl_easy_setopt(curl, CURLOPT_POSTFIELDS, "{\"q\":\"search term\"}");
curl_easy_setopt(curl, CURLOPT_PROXY, "http://gw.spyderproxy.com:8000");
curl_easy_setopt(curl, CURLOPT_PROXYUSERPWD, "USER:PASS");
curl_easy_setopt(curl, CURLOPT_PROXYTYPE, CURLPROXY_HTTP);
// For SOCKS5:
// curl_easy_setopt(curl, CURLOPT_PROXYTYPE, CURLPROXY_SOCKS5_HOSTNAME);
For volume scraping, rotate IPs by changing CURLOPT_PROXY per request. The Premium Residential gateway ($2.75/GB) handles rotation server-side; each request goes through a fresh IP automatically when you use the gateway endpoint.
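If you manage your own proxy list instead of a rotating gateway, client-side rotation is just a different CURLOPT_PROXY value per iteration. A minimal sketch, reusing the easy handle and buffer from the first example; urls/nurls and the proxy entries are placeholders:
// Client-side rotation: a different proxy per request, same easy handle.
const char *proxies[] = {
    "http://user:pass@203.0.113.10:8000",
    "http://user:pass@203.0.113.11:8000",
};
size_t nproxies = sizeof(proxies) / sizeof(proxies[0]);

for (size_t i = 0; i < nurls; i++) {
    buf.len = 0;  // reuse the buffer; write_cb appends from len and re-terminates
    curl_easy_setopt(curl, CURLOPT_URL, urls[i]);
    curl_easy_setopt(curl, CURLOPT_PROXY, proxies[i % nproxies]);
    CURLcode rc = curl_easy_perform(curl);
    if (rc != CURLE_OK)
        fprintf(stderr, "%s: %s\n", urls[i], curl_easy_strerror(rc));
}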
curl_easy_setopt(curl, CURLOPT_COOKIEFILE, "cookies.txt"); // read
curl_easy_setopt(curl, CURLOPT_COOKIEJAR, "cookies.txt"); // write
// Both at once: cookies are read before the first transfer and written out at curl_easy_cleanup.
libxml2 has an HTML parser that tolerates malformed markup. Use htmlReadMemory + XPath:
#include <libxml/HTMLparser.h>
#include <libxml/xpath.h>
#include <libxml/tree.h>
void extract_links(const char *html, size_t len) {
htmlDocPtr doc = htmlReadMemory(html, (int)len, NULL, "UTF-8",
HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
if (!doc) return;
xmlXPathContextPtr ctx = xmlXPathNewContext(doc);
xmlXPathObjectPtr res = xmlXPathEvalExpression(
(const xmlChar *)"//a[@href]", ctx);
if (res && res->nodesetval) {
for (int i = 0; i < res->nodesetval->nodeNr; i++) {
xmlNodePtr n = res->nodesetval->nodeTab[i];
xmlChar *href = xmlGetProp(n, (const xmlChar *)"href");
xmlChar *text = xmlNodeGetContent(n);
printf("%s\t%s\n", (const char *)href, text ? (const char *)text : "");
xmlFree(href);
xmlFree(text);
}
}
xmlXPathFreeObject(res);
xmlXPathFreeContext(ctx);
xmlFreeDoc(doc);
}
Build flags need libxml-2.0:
gcc scrape.c -o scrape $(pkg-config --cflags --libs libcurl libxml-2.0)
| What you want | XPath |
|---|---|
| All product titles in <h2 class="title"> | //h2[@class='title']/text() |
| Prices in any element with class containing "price" | //*[contains(@class,'price')] |
| Next-page link | //a[contains(text(),'Next')]/@href |
| JSON-LD blocks | //script[@type='application/ld+json']/text() |
| Meta description | //meta[@name='description']/@content |
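The extract_links pattern generalizes to any row in the table: evaluate the expression, walk the node set, take each node's string value. A small helper in that spirit (print_xpath is a name made up here, not a libxml2 API) keeps per-field extraction short:
// Hypothetical helper: evaluate an XPath expression and print each match's string value.
// Works for element, text() and @attribute results alike.
static void print_xpath(htmlDocPtr doc, const char *expr) {
    xmlXPathContextPtr ctx = xmlXPathNewContext(doc);
    if (!ctx) return;
    xmlXPathObjectPtr res = xmlXPathEvalExpression((const xmlChar *)expr, ctx);
    if (res && res->nodesetval) {
        for (int i = 0; i < res->nodesetval->nodeNr; i++) {
            xmlChar *val = xmlNodeGetContent(res->nodesetval->nodeTab[i]);
            if (val) {
                printf("%s\n", (const char *)val);
                xmlFree(val);
            }
        }
    }
    xmlXPathFreeObject(res);
    xmlXPathFreeContext(ctx);
}
Calling print_xpath(doc, "//meta[@name='description']/@content") after htmlReadMemory prints the description, one match per line.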
libcurl's "multi" interface runs many transfers concurrently on a single thread — far more efficient than thread-per-request:
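// Sketch: urls[] and proxies[] / nproxies come from your own code, N is a
// compile-time constant sizing the arrays, and error checks are trimmed.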
CURLM *multi = curl_multi_init();
CURL *handles[N];
buffer_t bufs[N] = {0};
for (int i = 0; i < N; i++) {
handles[i] = curl_easy_init();
curl_easy_setopt(handles[i], CURLOPT_URL, urls[i]);
curl_easy_setopt(handles[i], CURLOPT_WRITEFUNCTION, write_cb);
curl_easy_setopt(handles[i], CURLOPT_WRITEDATA, &bufs[i]);
curl_easy_setopt(handles[i], CURLOPT_PROXY, proxies[i % nproxies]);
curl_multi_add_handle(multi, handles[i]);
}
int still_running;
do {
curl_multi_perform(multi, &still_running);
curl_multi_poll(multi, NULL, 0, 1000, NULL);
} while (still_running);
for (int i = 0; i < N; i++) {
long status = 0;
curl_easy_getinfo(handles[i], CURLINFO_RESPONSE_CODE, &status);
printf("%s -> %ld (%zu bytes)\n", urls[i], status, bufs[i].len);
free(bufs[i].data);
curl_multi_remove_handle(multi, handles[i]);
curl_easy_cleanup(handles[i]);
}
curl_multi_cleanup(multi);
With curl_multi + a residential pool, a single C process comfortably hits 500–1,000 RPS on a small VPS. Beyond that you'll be bottlenecked by the proxy gateway, not C.
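If the gateway caps concurrent connections, you can let libcurl enforce the ceiling instead of sizing N by hand. CURLMOPT_MAX_TOTAL_CONNECTIONS (libcurl 7.30+) queues extra transfers until a slot frees up; the values below are illustrative, not tuned:
// Cap in-flight connections on the multi handle; queued handles start as slots free up.
curl_multi_setopt(multi, CURLMOPT_MAX_TOTAL_CONNECTIONS, 100L);
curl_multi_setopt(multi, CURLMOPT_MAX_HOST_CONNECTIONS, 10L); // optional per-host ceiling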
Memory-management checklist:

- realloc for the response buffer must check for a NULL return — out-of-memory under load is real.
- curl_slist_append needs a matching curl_slist_free_all.
- xmlChar * from xmlGetProp / xmlNodeGetContent needs xmlFree.
- htmlDocPtr needs xmlFreeDoc.
- Run valgrind --leak-check=full before shipping (command below). Long-running scrapers leak slowly until they OOM.
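A minimal gate for CI, assuming the scrape binary built earlier; with --error-exitcode, definite leaks fail the run:
valgrind --leak-check=full --error-exitcode=1 ./scrape https://example.com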
Some anti-bot systems fingerprint TLS at the handshake (JA3 / JA4). libcurl uses your system's OpenSSL / GnuTLS / NSS, which usually produces a recognizable curl fingerprint. Workarounds:

- libcurl-impersonate (the curl_cffi backend) for Chrome/Firefox TLS fingerprints.
- CURLOPT_SSL_CIPHER_LIST to match a browser cipher order.
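The cipher-list route is a partial fix at best: CURLOPT_SSL_CIPHER_LIST changes which ciphers are offered, not the TLS extensions that JA3/JA4 also hash, so it narrows the gap rather than cloning a browser. The order below is illustrative only; capture the real list from the browser you want to resemble:
// Illustrative cipher order for an OpenSSL-backed libcurl; not an exact browser match.
curl_easy_setopt(curl, CURLOPT_SSL_CIPHER_LIST,
    "ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:"
    "ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:"
    "ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305");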
| Language | Pages/sec, 1 process, 10k URLs | RAM peak |
|---|---|---|
| Python requests, sync loop | ~25 | ~60 MB |
| Python httpx async, 50 workers | ~280 | ~120 MB |
| Node.js fetch, 100 workers | ~310 | ~180 MB |
| Go net/http, 200 goroutines | ~520 | ~80 MB |
| C + libcurl multi, 100 handles | ~640 | ~25 MB |
Numbers from a 4-core c6i.xlarge fetching cached pages over a residential gateway. C beats Go, the closest competitor, by ~20% on throughput and ~3x on peak RAM (5–7x against the Python and Node options). If your bottleneck is the upstream proxy (it usually is at this scale), the throughput gap closes; the RAM gap stays.
Related: Web scraping with PHP · Concurrency vs parallelism · Rotating proxies with Python.