Playwright Python Tutorial: Scraping Infinite Scroll Pages
Infinite scroll breaks traditional scraping. There’s no “next page” link to follow. Content loads dynamically as you scroll. Here’s how to handle it properly with Playwright.
The Problem
Sites like Twitter, Reddit, and most e-commerce product listings use infinite scroll. The HTML you get from requests.get() contains maybe 10-20 items. The rest load via JavaScript when the user scrolls.
# This won't work for infinite scroll pages
import requests
response = requests.get("https://example.com/products")
# You'll only get the initial items, not the full list
You need a real browser that executes JavaScript and triggers scroll events.
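For comparison, here is a minimal sketch of the same fetch through a real browser (the URL and the .product-card selector are placeholders): Playwright executes the page's JavaScript, so the rendered DOM already contains the dynamically loaded items.
from playwright.sync_api import sync_playwright

# Minimal sketch; the URL and ".product-card" selector are placeholders
with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()
    page.goto("https://example.com/products")
    page.wait_for_selector(".product-card")
    # Unlike requests.get(), this count reflects the JavaScript-rendered DOM
    print(page.locator(".product-card").count())
    browser.close()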
Basic Scroll-to-Bottom Pattern
The simplest approach: keep scrolling until no new content appears.
from playwright.sync_api import sync_playwright
import time
def scrape_infinite_scroll(url: str) -> list[str]:
with sync_playwright() as p:
browser = p.chromium.launch(headless=True)
page = browser.new_page()
page.goto(url)
items = []
previous_height = 0
while True:
# Scroll to bottom
page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
# Wait for content to load
time.sleep(2)
# Check if we've reached the end
current_height = page.evaluate("document.body.scrollHeight")
if current_height == previous_height:
break
previous_height = current_height
            # Re-query the list; on a plain infinite scroll page old items
            # stay in the DOM, so this picks up everything loaded so far
            items = page.query_selector_all(".item-selector")
# Extract data from all items
results = [item.text_content() for item in items]
browser.close()
return results
Problem: time.sleep(2) is wasteful. You’re either waiting too long or not long enough.
Better: Wait for Network Idle
Instead of fixed delays, wait for the network to settle:
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
def scrape_with_network_wait(url: str, item_selector: str) -> list[str]:
with sync_playwright() as p:
browser = p.chromium.launch(headless=True)
page = browser.new_page()
page.goto(url)
previous_count = 0
max_scrolls = 50 # Safety limit
for _ in range(max_scrolls):
# Scroll and wait for network
page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
try:
# Wait for network to be idle (no requests for 500ms)
page.wait_for_load_state("networkidle", timeout=5000)
            except PlaywrightTimeoutError:
                pass  # Timeout just means nothing new loaded
# Check item count
current_count = page.locator(item_selector).count()
if current_count == previous_count:
break
previous_count = current_count
# Extract all items
items = page.locator(item_selector).all()
results = [item.text_content() for item in items]
browser.close()
return results
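One caveat: networkidle can fail to settle on sites that keep long-polling or analytics requests in flight, so the timeout above would fire on every scroll. An alternative is to wait for the item count itself to grow. A minimal sketch, assuming the same item_selector:
from playwright.sync_api import TimeoutError as PlaywrightTimeoutError

def wait_for_more_items(page, item_selector: str, previous_count: int, timeout: int = 5000) -> bool:
    """Return True if new items appeared before the timeout expired."""
    try:
        page.wait_for_function(
            "([selector, count]) => document.querySelectorAll(selector).length > count",
            arg=[item_selector, previous_count],
            timeout=timeout,
        )
        return True
    except PlaywrightTimeoutError:
        return False
Call it in place of the wait_for_load_state() call; a False return usually means you have reached the end of the list.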
Intercepting the API Directly
The cleanest approach: don’t scroll at all. Most infinite scroll implementations fetch data from an API. Intercept it.
from playwright.sync_api import sync_playwright
def scrape_via_api_intercept(url: str) -> list[dict]:
all_data = []
def handle_response(response):
# Adjust this condition for your target site
if "/api/products" in response.url and response.status == 200:
try:
data = response.json()
if "items" in data:
all_data.extend(data["items"])
                except Exception:
                    pass  # Body wasn't the JSON shape we expected; skip it
with sync_playwright() as p:
browser = p.chromium.launch(headless=True)
page = browser.new_page()
# Listen for API responses
page.on("response", handle_response)
page.goto(url)
# Scroll to trigger API calls
for _ in range(20):
page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
page.wait_for_timeout(1000)
browser.close()
return all_data
Why this is better: You get clean JSON instead of parsing HTML. The data structure is exactly what the frontend uses.
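Once you know the endpoint, you can often skip scrolling entirely and page through the API with Playwright's request context, which reuses the browser's cookies. This is a sketch only: the /api/products path and the page/limit parameter names are assumptions, not a real API.
from playwright.sync_api import sync_playwright

def scrape_api_directly(base_url: str, api_path: str = "/api/products") -> list[dict]:
    # Sketch: the endpoint path and paging parameters are assumptions
    all_items = []
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()
        page.goto(base_url)  # establish cookies/session first
        page_num = 1
        while True:
            # page.request shares the browser context's cookies
            response = page.request.get(
                f"{base_url}{api_path}",
                params={"page": page_num, "limit": 50},
            )
            if not response.ok:
                break
            items = response.json().get("items", [])
            if not items:
                break
            all_items.extend(items)
            page_num += 1
        browser.close()
    return all_items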
Handling Virtual Lists (React/Vue)
Modern frameworks use “virtualized” lists. Only visible items exist in the DOM. As you scroll, old items are destroyed and new ones are created.
from playwright.sync_api import sync_playwright
def scrape_virtual_list(url: str, item_selector: str, scroll_container: str) -> list[str]:
"""
For virtualized lists where items are recycled.
We collect data incrementally as we scroll.
"""
with sync_playwright() as p:
browser = p.chromium.launch(headless=True)
page = browser.new_page()
page.goto(url)
page.wait_for_selector(item_selector)
collected = set() # Use set to avoid duplicates
last_item_id = None
stale_count = 0
while stale_count < 3:
# Get currently visible items
items = page.locator(item_selector).all()
for item in items:
# Extract unique identifier (adjust selector as needed)
item_id = item.get_attribute("data-id") or item.text_content()
if item_id and item_id not in collected:
collected.add(item_id)
            # Check progress before scrolling: locators from .all() re-resolve
            # by index, so they go stale once the virtual list recycles rows
            current_last = items[-1].get_attribute("data-id") if items else None
            if current_last == last_item_id:
                stale_count += 1
            else:
                stale_count = 0
            last_item_id = current_last
            # Scroll the container by one viewport of the list
            page.locator(scroll_container).evaluate(
                "el => el.scrollTop = el.scrollTop + el.clientHeight"
            )
            page.wait_for_timeout(500)
browser.close()
return list(collected)
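Usage looks like this (the URL and selectors are hypothetical). Note that scrolling a full clientHeight per step can jump past very tall rows, so reduce the step in the evaluate() call if you see gaps.
rows = scrape_virtual_list(
    "https://example.com/feed",       # hypothetical URL
    item_selector="[data-id]",        # hypothetical item selector
    scroll_container=".feed-scroll",  # hypothetical scrollable element
)
print(f"Collected {len(rows)} unique items")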
Lazy-Loaded Images
Images often load separately from content. If you need image URLs:
from playwright.sync_api import sync_playwright
def get_lazy_images(url: str, img_selector: str) -> list[str]:
with sync_playwright() as p:
browser = p.chromium.launch(headless=True)
page = browser.new_page()
page.goto(url)
image_urls = []
# Scroll slowly to trigger lazy loading
viewport_height = page.viewport_size["height"]
page_height = page.evaluate("document.body.scrollHeight")
current_position = 0
while current_position < page_height:
page.evaluate(f"window.scrollTo(0, {current_position})")
page.wait_for_timeout(300) # Brief pause for images to load
current_position += viewport_height // 2
page_height = page.evaluate("document.body.scrollHeight")
# Collect all image sources
images = page.locator(img_selector).all()
for img in images:
src = img.get_attribute("src") or img.get_attribute("data-src")
if src and src not in image_urls:
image_urls.append(src)
browser.close()
return image_urls
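If some entries still come back with placeholder src values, you can force each image into view instead of scrolling blindly. A sketch, assuming the same img_selector:
def force_lazy_images(page, img_selector: str) -> None:
    # Scroll each image into the viewport so its lazy loader fires
    for img in page.locator(img_selector).all():
        img.scroll_into_view_if_needed()
        page.wait_for_timeout(150)  # brief pause for the data-src -> src swap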
Common Pitfalls
1. Scrolling too fast
Sites detect automated scrolling. Add realistic delays:
import random
# Instead of fixed delays
page.wait_for_timeout(random.randint(800, 1500))
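You can also scroll in small, uneven steps with the mouse wheel instead of jumping straight to the bottom; a sketch:
import random

def human_like_scroll(page, steps: int = 20) -> None:
    # Scroll in small, randomized increments rather than one big jump
    for _ in range(steps):
        page.mouse.wheel(0, random.randint(300, 800))
        page.wait_for_timeout(random.randint(800, 1500))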
2. Missing the scroll container
Not all pages scroll on document.body. Find the actual scrollable element:
# Quick heuristic: look for an element with an inline overflow style
container = page.locator("[style*='overflow']").first
container.evaluate("el => el.scrollTop = el.scrollHeight")
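The inline-style selector above only matches elements whose overflow is set via a style attribute. A more reliable sketch is to ask the page for the first element whose computed style makes it scrollable:
# Find the first scrollable element by computed style (sketch)
container_handle = page.evaluate_handle(
    """() => {
        for (const el of document.querySelectorAll('*')) {
            const style = getComputedStyle(el);
            if ((style.overflowY === 'auto' || style.overflowY === 'scroll')
                && el.scrollHeight > el.clientHeight) {
                return el;
            }
        }
        return document.scrollingElement;
    }"""
)
container_handle.evaluate("el => el.scrollTop = el.scrollHeight")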
3. Ignoring rate limits
Aggressive scrolling triggers rate limiting. Check for error states:
error_message = page.locator(".rate-limit-message")
if error_message.is_visible():
page.wait_for_timeout(60000) # Wait a minute
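For transient limits, an exponential backoff around the scroll step is usually enough. A sketch, assuming the same .rate-limit-message selector:
def scroll_with_backoff(page, max_retries: int = 3) -> None:
    delay = 5_000  # milliseconds; doubles on every retry
    for _ in range(max_retries):
        page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        if not page.locator(".rate-limit-message").is_visible():
            return  # no rate-limit banner, carry on
        page.wait_for_timeout(delay)
        delay *= 2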
4. Memory leaks on long sessions
For very long scraping sessions, periodically restart the browser:
def scrape_large_site(urls: list[str], batch_size: int = 100):
results = []
for i in range(0, len(urls), batch_size):
batch = urls[i:i + batch_size]
batch_results = scrape_batch(batch) # Fresh browser per batch
results.extend(batch_results)
return results
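scrape_batch is left undefined above; a minimal sketch of it, reusing the scroll-to-bottom pattern with one fresh browser per batch (the .item selector is a placeholder):
from playwright.sync_api import sync_playwright

def scrape_batch(urls: list[str]) -> list[list[str]]:
    # One fresh browser per batch; closed when the batch is done
    batch_results = []
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        for url in urls:
            page = browser.new_page()
            page.goto(url)
            previous_height = 0
            while True:
                page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                page.wait_for_timeout(1000)
                current_height = page.evaluate("document.body.scrollHeight")
                if current_height == previous_height:
                    break
                previous_height = current_height
            items = page.locator(".item").all()  # placeholder selector
            batch_results.append([item.text_content() for item in items])
            page.close()
        browser.close()
    return batch_results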
Async Version for Concurrency
For scraping multiple infinite scroll pages in parallel:
import asyncio
from playwright.async_api import async_playwright
async def scrape_page(url: str, browser) -> list[str]:
page = await browser.new_page()
await page.goto(url)
previous_height = 0
while True:
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
await page.wait_for_timeout(1000)
current_height = await page.evaluate("document.body.scrollHeight")
if current_height == previous_height:
break
previous_height = current_height
items = await page.locator(".item").all()
results = [await item.text_content() for item in items]
await page.close()
return results
async def scrape_multiple(urls: list[str], concurrency: int = 5) -> dict:
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
semaphore = asyncio.Semaphore(concurrency)
async def limited_scrape(url):
async with semaphore:
return await scrape_page(url, browser)
tasks = [limited_scrape(url) for url in urls]
        # Failed pages come back as exception objects instead of raising
        results = await asyncio.gather(*tasks, return_exceptions=True)
await browser.close()
return dict(zip(urls, results))
# Usage
urls = ["https://example.com/page1", "https://example.com/page2"]
results = asyncio.run(scrape_multiple(urls))
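Because gather() runs with return_exceptions=True, failed URLs appear in the dict as exception objects rather than lists, so split them out before processing:
ok = {url: r for url, r in results.items() if not isinstance(r, Exception)}
failed = {url: r for url, r in results.items() if isinstance(r, Exception)}
print(f"{len(ok)} pages scraped, {len(failed)} failed")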
Summary
| Pattern | Best For | Complexity |
|---|---|---|
| Scroll-to-bottom | Simple pages | Low |
| Network idle wait | API-backed content | Medium |
| API interception | Clean data extraction | Medium |
| Virtual list handler | React/Vue apps | High |
Start with the simplest pattern that works. Only add complexity when you hit specific problems.