Playwright Python Tutorial: Scraping Infinite Scroll Pages


Infinite scroll breaks traditional scraping. There’s no “next page” link to follow. Content loads dynamically as you scroll. Here’s how to handle it properly with Playwright.

The Problem

Sites like Twitter, Reddit, and most e-commerce product listings use infinite scroll. The HTML you get from requests.get() contains maybe 10-20 items. The rest load via JavaScript when the user scrolls.

# This won't work for infinite scroll pages
import requests
response = requests.get("https://example.com/products")
# You'll only get the initial items, not the full list

You need a real browser that executes JavaScript and triggers scroll events.

Basic Scroll-to-Bottom Pattern

The simplest approach: keep scrolling until no new content appears.

from playwright.sync_api import sync_playwright
import time

def scrape_infinite_scroll(url: str) -> list[str]:
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()
        page.goto(url)

        previous_height = 0

        while True:
            # Scroll to bottom
            page.evaluate("window.scrollTo(0, document.body.scrollHeight)")

            # Wait for content to load
            time.sleep(2)

            # Check if we've reached the end
            current_height = page.evaluate("document.body.scrollHeight")
            if current_height == previous_height:
                break
            previous_height = current_height

        # Collect every item once the page has stopped growing
        items = page.query_selector_all(".item-selector")
        results = [item.text_content() for item in items]
        browser.close()
        return results
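
A minimal usage sketch (the URL is a placeholder, and the function assumes your items match ".item-selector"):

titles = scrape_infinite_scroll("https://example.com/products")
print(f"Collected {len(titles)} items")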

Problem: time.sleep(2) is wasteful. You’re either waiting too long or not long enough.

Better: Wait for Network Idle

Instead of fixed delays, wait for the network to settle:

from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError

def scrape_with_network_wait(url: str, item_selector: str) -> list[str]:
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()
        page.goto(url)

        previous_count = 0
        max_scrolls = 50  # Safety limit

        for _ in range(max_scrolls):
            # Scroll and wait for network
            page.evaluate("window.scrollTo(0, document.body.scrollHeight)")

            try:
                # Wait for the network to go idle (no in-flight requests for 500 ms)
                page.wait_for_load_state("networkidle", timeout=5000)
            except PlaywrightTimeoutError:
                pass  # A timeout here is fine; it just means nothing new loaded

            # Check item count
            current_count = page.locator(item_selector).count()
            if current_count == previous_count:
                break
            previous_count = current_count

        # Extract all items
        items = page.locator(item_selector).all()
        results = [item.text_content() for item in items]

        browser.close()
        return results
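
Usage sketch (hypothetical URL and selector):

products = scrape_with_network_wait("https://example.com/products", ".product-card")
print(f"Found {len(products)} products")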

Intercepting the API Directly

The cleanest approach: skip HTML parsing entirely. Most infinite scroll implementations fetch their data from a JSON API as you scroll. Intercept those responses and take the data directly.

from playwright.sync_api import sync_playwright
import json

def scrape_via_api_intercept(url: str) -> list[dict]:
    all_data = []

    def handle_response(response):
        # Adjust this condition for your target site
        if "/api/products" in response.url and response.status == 200:
            try:
                data = response.json()
                if "items" in data:
                    all_data.extend(data["items"])
            except Exception:
                pass  # Response wasn't JSON or had an unexpected shape; skip it

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()

        # Listen for API responses
        page.on("response", handle_response)

        page.goto(url)

        # Scroll to trigger API calls
        for _ in range(20):
            page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            page.wait_for_timeout(1000)

        browser.close()

    return all_data

Why this is better: You get clean JSON instead of parsing HTML. The data structure is exactly what the frontend uses.
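
To know which URL substring to filter on, it helps to log the page's XHR/fetch traffic while triggering one scroll. A quick discovery sketch (function name and timings are illustrative):

from playwright.sync_api import sync_playwright

def discover_api_endpoints(url: str) -> None:
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()

        def log_response(response):
            # Only XHR/fetch requests are interesting; static assets are noise
            if response.request.resource_type in ("xhr", "fetch"):
                print(response.status, response.url)

        page.on("response", log_response)
        page.goto(url)

        # One scroll is usually enough to trigger the next-page request
        page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        page.wait_for_timeout(2000)
        browser.close()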

Handling Virtual Lists (React/Vue)

Modern frameworks use “virtualized” lists. Only visible items exist in the DOM. As you scroll, old items are destroyed and new ones are created.

from playwright.sync_api import sync_playwright

def scrape_virtual_list(url: str, item_selector: str, scroll_container: str) -> list[str]:
    """
    For virtualized lists where items are recycled.
    We collect data incrementally as we scroll.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()
        page.goto(url)
        page.wait_for_selector(item_selector)

        collected = set()  # Use set to avoid duplicates
        last_item_id = None
        stale_count = 0

        while stale_count < 3:
            # Get currently visible items
            items = page.locator(item_selector).all()

            for item in items:
                # Extract a unique identifier (adjust the attribute as needed)
                item_id = item.get_attribute("data-id") or item.text_content()
                if item_id and item_id not in collected:
                    collected.add(item_id)

            # Remember the last visible item *before* scrolling; the virtualized
            # list may recycle these DOM nodes once we scroll
            current_last = items[-1].get_attribute("data-id") if items else None

            # Scroll the container by one viewport height
            page.locator(scroll_container).evaluate(
                "el => el.scrollTop = el.scrollTop + el.clientHeight"
            )
            page.wait_for_timeout(500)

            # Check if we're making progress
            if current_last == last_item_id:
                stale_count += 1
            else:
                stale_count = 0
                last_item_id = current_last

        browser.close()
        return list(collected)
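
Usage sketch; the URL and the two selectors are placeholders you'll need to adapt to your target site:

rows = scrape_virtual_list(
    "https://example.com/feed",
    "[data-testid='feed-item']",      # one virtualized row
    "[data-testid='feed-scroller']",  # the element that actually scrolls
)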

Lazy-Loaded Images

Images often load separately from content. If you need image URLs:

from playwright.sync_api import sync_playwright

def get_lazy_images(url: str, img_selector: str) -> list[str]:
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()
        page.goto(url)

        image_urls = []

        # Scroll slowly to trigger lazy loading
        viewport_height = page.viewport_size["height"]
        page_height = page.evaluate("document.body.scrollHeight")

        current_position = 0
        while current_position < page_height:
            page.evaluate(f"window.scrollTo(0, {current_position})")
            page.wait_for_timeout(300)  # Brief pause for images to load

            current_position += viewport_height // 2
            page_height = page.evaluate("document.body.scrollHeight")

        # Collect all image sources
        images = page.locator(img_selector).all()
        for img in images:
            src = img.get_attribute("src") or img.get_attribute("data-src")
            if src and src not in image_urls:
                image_urls.append(src)

        browser.close()
        return image_urls
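
Usage sketch (placeholder URL and selector):

photos = get_lazy_images("https://example.com/gallery", "img.product-photo")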

Common Pitfalls

1. Scrolling too fast

Some sites flag instant, perfectly regular scrolling as bot behavior. Add randomized, human-like delays:

import random

# Instead of fixed delays
page.wait_for_timeout(random.randint(800, 1500))
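
It can also help to scroll in smaller, randomized steps rather than jumping straight to the bottom. A sketch of that idea, assuming an already-open page (the step sizes and pauses are arbitrary):

import random
from playwright.sync_api import Page

def human_like_scroll(page: Page, steps: int = 10) -> None:
    viewport_height = page.viewport_size["height"]
    for _ in range(steps):
        # Scroll roughly half to one viewport at a time, with a jittered pause
        step = random.randint(viewport_height // 2, viewport_height)
        page.evaluate(f"window.scrollBy(0, {step})")
        page.wait_for_timeout(random.randint(800, 1500))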

2. Missing the scroll container

Not all pages scroll on document.body. Find the actual scrollable element:

# Find the scrollable container (usually has overflow-y: scroll)
container = page.locator("[style*='overflow']").first
container.evaluate("el => el.scrollTop = el.scrollHeight")
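
Note that [style*='overflow'] only matches inline style attributes. A more general sketch checks computed styles in the browser and falls back to the document's scrolling element (heuristic: the first element with overflow-y auto/scroll and more content than fits):

page.evaluate(
    """() => {
        const el = [...document.querySelectorAll('*')].find(e => {
            const s = getComputedStyle(e);
            return ['auto', 'scroll'].includes(s.overflowY) && e.scrollHeight > e.clientHeight;
        }) || document.scrollingElement;
        el.scrollTop = el.scrollHeight;
    }"""
)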

3. Ignoring rate limits

Aggressive scrolling triggers rate limiting. Check for error states:

error_message = page.locator(".rate-limit-message")
if error_message.is_visible():
    page.wait_for_timeout(60000)  # Wait a minute
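
Many sites also signal throttling at the HTTP level. A sketch that flags 429 responses so your scroll loop can back off (the one-minute wait is arbitrary):

rate_limited = {"hit": False}

def watch_for_429(response):
    if response.status == 429:
        rate_limited["hit"] = True

page.on("response", watch_for_429)

# ...then, inside your scroll loop:
if rate_limited["hit"]:
    page.wait_for_timeout(60000)  # Back off for a minute before scrolling again
    rate_limited["hit"] = False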

4. Memory leaks on long sessions

For very long scraping sessions, periodically restart the browser:

def scrape_large_site(urls: list[str], batch_size: int = 100):
    results = []

    for i in range(0, len(urls), batch_size):
        batch = urls[i:i + batch_size]
        batch_results = scrape_batch(batch)  # Fresh browser per batch
        results.extend(batch_results)

    return results
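
scrape_batch isn't defined above; a minimal sketch of what it could look like, launching a fresh browser per batch and reusing the height-based scroll loop (".item" is a placeholder selector):

from playwright.sync_api import sync_playwright

def scrape_batch(urls: list[str]) -> list[str]:
    # A fresh browser per batch keeps memory bounded over long runs
    results = []
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        for url in urls:
            page = browser.new_page()
            page.goto(url)

            previous_height = 0
            while True:
                page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                page.wait_for_timeout(1000)
                current_height = page.evaluate("document.body.scrollHeight")
                if current_height == previous_height:
                    break
                previous_height = current_height

            results.extend(item.text_content() for item in page.locator(".item").all())
            page.close()
        browser.close()
    return results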

Async Version for Concurrency

For scraping multiple infinite scroll pages in parallel:

import asyncio
from playwright.async_api import async_playwright

async def scrape_page(url: str, browser) -> list[str]:
    page = await browser.new_page()
    await page.goto(url)

    previous_height = 0
    while True:
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        await page.wait_for_timeout(1000)

        current_height = await page.evaluate("document.body.scrollHeight")
        if current_height == previous_height:
            break
        previous_height = current_height

    items = await page.locator(".item").all()
    results = [await item.text_content() for item in items]
    await page.close()
    return results

async def scrape_multiple(urls: list[str], concurrency: int = 5) -> dict:
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)

        semaphore = asyncio.Semaphore(concurrency)

        async def limited_scrape(url):
            async with semaphore:
                return await scrape_page(url, browser)

        tasks = [limited_scrape(url) for url in urls]
        results = await asyncio.gather(*tasks, return_exceptions=True)

        await browser.close()

    return dict(zip(urls, results))

# Usage
urls = ["https://example.com/page1", "https://example.com/page2"]
results = asyncio.run(scrape_multiple(urls))

Summary

| Pattern | Best For | Complexity |
| --- | --- | --- |
| Scroll-to-bottom | Simple pages | Low |
| Network idle wait | API-backed content | Medium |
| API interception | Clean data extraction | Medium |
| Virtual list handler | React/Vue apps | High |

Start with the simplest pattern that works. Only add complexity when you hit specific problems.