Quick answer: For most static sites — requests + BeautifulSoup to grab the URLs from <img> tags, then a worker pool to download. For lazy-loaded galleries (Instagram-style infinite scroll, React/Vue image grids), use Playwright with explicit scroll-and-wait. For volume (100k+ images), switch to async httpx with a rotating proxy pool. Always prefer the srcset highest-resolution candidate over src, dedup by SHA-256 of bytes, and respect robots.txt + copyright.
pip install requests beautifulsoup4 lxml "httpx[http2]" playwright pillow imagehash
playwright install chromium
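The quick answer says to respect robots.txt, and the standard library covers that. A minimal sketch using urllib.robotparser; the user-agent string should match what you send in your requests, and in real code you would cache one parser per host:

from urllib import robotparser
from urllib.parse import urljoin, urlparse

def allowed(url, user_agent="ImageBot/1.0"):
    root = f"{urlparse(url).scheme}://{urlparse(url).netloc}"
    rp = robotparser.RobotFileParser()
    rp.set_url(urljoin(root, "/robots.txt"))
    rp.read()  # fetches and parses the site's robots.txt
    return rp.can_fetch(user_agent, url)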
The simplest case — image URLs are in the HTML as <img src="...">.
import os
import hashlib
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
HEADERS = {
"User-Agent": "Mozilla/5.0 (compatible; ImageBot/1.0; +https://example.com/bot)"
}
def best_url(img, page_url):
'''Pick highest-res candidate from srcset, fall back to src.'''
srcset = img.get("srcset") or img.get("data-srcset")
if srcset:
candidates = []
for part in srcset.split(","):
tokens = part.strip().split()
if len(tokens) >= 2 and tokens[-1].endswith("w"):
candidates.append((int(tokens[-1][:-1]), tokens[0]))
if candidates:
return urljoin(page_url, max(candidates)[1])
for attr in ("src", "data-src", "data-original", "data-lazy"):
if img.get(attr):
return urljoin(page_url, img[attr])
return None
def scrape_images(page_url, out_dir="images"):
os.makedirs(out_dir, exist_ok=True)
r = requests.get(page_url, headers=HEADERS, timeout=20)
r.raise_for_status()
soup = BeautifulSoup(r.text, "lxml")
seen = set()
for img in soup.find_all("img"):
url = best_url(img, page_url)
if not url or url.startswith("data:"):
continue
alt = img.get("alt", "")
try:
ir = requests.get(url, headers=HEADERS, timeout=20)
ir.raise_for_status()
except Exception as e:
print(f" skip {url}: {e}")
continue
digest = hashlib.sha256(ir.content).hexdigest()[:16]
if digest in seen:
continue
seen.add(digest)
ext = os.path.splitext(urlparse(url).path)[1] or ".jpg"
fname = f"{digest}{ext}"
with open(os.path.join(out_dir, fname), "wb") as f:
f.write(ir.content)
print(f" saved {fname} alt={alt!r}")
scrape_images("https://example.com/gallery")
For 1,000+ images you want concurrent downloads — a sync loop wastes 95% of the time waiting on I/O.
import asyncio
import hashlib
import os
from urllib.parse import urljoin
import httpx
from bs4 import BeautifulSoup
PROXY = "http://USER:[email protected]:8000"
CONCURRENCY = 20
async def fetch_page(client, url):
r = await client.get(url, timeout=20)
r.raise_for_status()
return r.text
async def download(client, url, out_dir, sem, seen):
async with sem:
try:
r = await client.get(url, timeout=20)
r.raise_for_status()
except Exception as e:
return None
digest = hashlib.sha256(r.content).hexdigest()[:16]
if digest in seen:
return None
seen.add(digest)
ext = os.path.splitext(url.split("?")[0])[1] or ".jpg"
path = os.path.join(out_dir, f"{digest}{ext}")
with open(path, "wb") as f:
f.write(r.content)
return path
async def main(page_url, out_dir="images"):
os.makedirs(out_dir, exist_ok=True)
async with httpx.AsyncClient(proxy=PROXY, http2=True,
headers={"User-Agent": "ImageBot/1.0"}) as client:
html = await fetch_page(client, page_url)
soup = BeautifulSoup(html, "lxml")
        urls = [urljoin(page_url, img["src"]) for img in soup.find_all("img") if img.get("src")]
sem = asyncio.Semaphore(CONCURRENCY)
seen = set()
tasks = [download(client, u, out_dir, sem, seen) for u in urls]
results = await asyncio.gather(*tasks)
print(f"saved {sum(1 for r in results if r)} / {len(urls)} images")
asyncio.run(main("https://example.com/gallery"))
On a Premium Residential pool, 20 concurrent workers comfortably hit 50–100 images/sec without tripping rate limits. Push higher (50–100 concurrent) only on residential, never on a single datacenter IP.
Modern sites defer image loading via IntersectionObserver or React virtualized lists. The HTML you fetch with requests has placeholders; the real src only appears after scroll. Use Playwright to drive a real browser.
import asyncio
from playwright.async_api import async_playwright
async def scrape_lazy(url):
async with async_playwright() as p:
browser = await p.chromium.launch(
headless=True,
proxy={"server": "http://gw.spyderproxy.com:8000",
"username": "USER", "password": "PASS"},
)
page = await browser.new_page()
await page.goto(url, wait_until="domcontentloaded")
# Scroll to bottom in steps so IntersectionObserver fires for each row
prev_height = 0
for _ in range(30):
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
await page.wait_for_timeout(800)
h = await page.evaluate("document.body.scrollHeight")
if h == prev_height:
break
prev_height = h
        urls = await page.evaluate(
            """() => Array.from(document.querySelectorAll('img'))
                     .map(i => i.currentSrc || i.src)
                     .filter(s => s && !s.startsWith('data:'))"""
        )
await browser.close()
return urls
urls = asyncio.run(scrape_lazy("https://example.com/feed"))
print(f"found {len(urls)} images")
Key Playwright tips: use img.currentSrc (not img.src) to get the resolution the browser actually picked from srcset, and scroll in steps with a wait between — one big scroll skips intermediate observer callbacks.
The srcset attribute lets a site offer multiple resolutions. Cheap scrapers grab the small src placeholder and end up with thumbnails. Parse srcset and pick the highest-width candidate.
def parse_srcset(srcset):
'''Return list of (url, descriptor_value, descriptor_type).'''
out = []
for part in srcset.split(","):
tokens = part.strip().split()
if not tokens:
continue
url = tokens[0]
if len(tokens) == 1:
out.append((url, 1.0, "x"))
else:
d = tokens[-1]
if d.endswith("w"):
out.append((url, float(d[:-1]), "w"))
elif d.endswith("x"):
out.append((url, float(d[:-1]), "x"))
return out
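Usage, picking the widest candidate from a parsed srcset (hypothetical URLs):

candidates = parse_srcset("small.jpg 480w, medium.jpg 800w, large.jpg 1600w")
best = max(candidates, key=lambda c: c[1])  # sort by descriptor value
print(best)  # ('large.jpg', 1600.0, 'w')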
For training image-language models you want the alt attribute and surrounding caption. Caption text is often in a sibling <figcaption> or a parent <figure>:
def get_caption(img):
fig = img.find_parent("figure")
if fig:
cap = fig.find("figcaption")
if cap:
return cap.get_text(strip=True)
return img.get("alt", "")
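Combined with best_url() from the first script, this yields (image URL, caption) pairs straight from a parsed page. A sketch, assuming both helpers are in scope:

def image_caption_pairs(soup, page_url):
    # Yield (absolute image URL, caption) tuples for an image-text dataset.
    for img in soup.find_all("img"):
        url = best_url(img, page_url)
        if url and not url.startswith("data:"):
            yield url, get_caption(img)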
SHA-256 of the bytes catches exact duplicates. For near-duplicates (resized, recompressed) use perceptual hashing:
from PIL import Image
import imagehash
def phash(path):
return str(imagehash.phash(Image.open(path)))
# Near-duplicates have a small Hamming distance between hashes, not identical values.
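imagehash objects subtract to give that Hamming distance, so a near-duplicate check is a single comparison. A sketch with an arbitrary threshold of 5 bits (tune it for your data):

def near_duplicate(path_a, path_b, threshold=5):
    # A small bit difference between perceptual hashes means "visually the same image".
    return imagehash.phash(Image.open(path_a)) - imagehash.phash(Image.open(path_b)) <= threshold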
Image hosts (Imgur, Cloudinary, Akamai CDNs) rate-limit aggressively by IP because their bandwidth bill scales with hot-linkers. Pull 5,000 images from one IP in 10 minutes and you're looking at HTTP 429 or 403. Rotating residential proxies solve this by spreading requests across thousands of consumer IPs.
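When a host does start returning 429 or 403, back off instead of hammering it. A minimal retry sketch for the httpx downloader; the attempt count and delays are arbitrary starting points:

import asyncio

async def get_with_retry(client, url, attempts=4):
    for i in range(attempts):
        r = await client.get(url, timeout=20)
        if r.status_code in (403, 429) and i < attempts - 1:
            await asyncio.sleep(2 ** i)  # back off 1s, 2s, 4s before retrying
            continue
        r.raise_for_status()
        return r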
A few final gotchas:
- Some publishers opt out of AI training with a noimageai meta tag; respect it.
- Strip EXIF metadata (piexif handles this) if redistributing images.
- Many CDNs block hotlinked requests that lack a Referer. Pass it: headers={"Referer": page_url}.
- Lazy-loading libraries often park the real URL in data-src rather than src.
- Check the response Content-Type; reject anything that isn't image/*.

Related: Scraping a site that needs login · Scrape text from a website · Python asyncio tutorial · Avoiding scraper detection.