python 爬取静态页面

2025-05-25

“爬虫”（Web Crawler）这个名字来源于英语中的 “crawl”，意思是“爬行”。比喻网络爬虫像蜘蛛一样在互联网上“爬行”，逐个网页扒取数据，也被称为 “Spider（蜘蛛）”。
上世纪 90 年代初，互联网刚刚兴起，网页数量飞速增长，为了建立搜索引擎，必须自动抓取网页内容并进行索引。网络爬虫程序最早诞生于 1993 年。早期的网络爬虫主要用于收集网页统计信息和构建搜索引擎索引。随着 Web 的发展，现代爬虫还可以执行网页可访问性和安全性检查、模拟用户交互等任务。

最开始我是用爬虫来抓取静态数据页面，所以用 Python 写了个小脚本。
实现：通过入口 url 扒取特定 HTML 内容并保存在本地，提取页面内 URL 作为下一次抓取的 URL。适用：简单的 blog 系列

import time
import random
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Request headers passed to every requests.get() call below; put a real
# browser User-Agent string here so the request looks like ordinary traffic.
headers = {"User-Agent": "headers"}

# CSS class of the element whose content is extracted and saved from each page.
target_class = "content_class"

def crawl(url, visited=None, max_pages=100):
    """Crawl a chain of static pages starting at *url* and save each one locally.

    For every page fetched, the first element whose class is ``target_class``
    is written to ``<h1 text>.html`` in the current directory, after the
    ``src`` of every ``<img>`` inside it has been made absolute.  The first
    ``<a class="menu-item">`` link that has not been visited yet becomes the
    next page to fetch.

    Args:
        url: Entry URL of the crawl.
        visited: Optional set of URLs to skip; it is updated in place with
            every page crawled.  A fresh set is created when omitted.
        max_pages: Upper bound on the number of pages fetched.

    Returns:
        None.  Results are written to disk and progress is printed.
    """
    # A ``set()`` default is created once and shared by every call, so pages
    # crawled by one run would silently be skipped by the next one.
    if visited is None:
        visited = set()

    # Characters that are not allowed in file names on common file systems.
    unsafe_chars = str.maketrans({ch: "_" for ch in '\\/:*?"<>|'})
    max_retries = 3  # consecutive failures tolerated before giving up
    failures = 0
    count = 0
    while url and count < max_pages:
        if url in visited:
            break
        time.sleep(random.uniform(2, 5))  # throttle so the site does not block us
        print('-------start-------')
        print(f"正在爬取：{url}")

        response = None
        throttled = False
        try:
            # Without a timeout a stalled connection would hang the crawl forever.
            response = requests.get(url, headers=headers, timeout=10)
            throttled = response.status_code == 429  # HTTP 429 Too Many Requests
            if not throttled:
                response.raise_for_status()
        except requests.RequestException as e:
            print(f"请求失败：{e}")
            response = None

        if response is None or throttled:
            # Retry the same URL, but only a bounded number of times; an
            # unconditional retry loops forever on a permanently broken URL.
            failures += 1
            if failures >= max_retries:
                print(f"连续失败 {failures} 次，停止爬取：{url}")
                break
            if throttled:
                print("被限流，等待重试...")
                time.sleep(random.uniform(5, 10))  # back off before retrying
            continue
        failures = 0

        soup = BeautifulSoup(response.text, "html.parser")

        # Use the <h1> text as the file name; fall back to a counter when the
        # heading is missing or empty, and strip characters that would be
        # interpreted as path separators or are illegal in file names.
        h1 = soup.find("h1")
        title = (h1.get_text(strip=True) if h1 else "") or f"page_{count}"
        filename = title.translate(unsafe_chars) + ".html"

        target = soup.find(class_=target_class)
        if target:
            # Relative image URLs would break once the fragment is saved
            # locally, so resolve them against the page URL.
            for img in target.find_all("img"):
                src = img.get("src")
                if src:
                    img["src"] = urljoin(url, src)
            with open(filename, "w", encoding="utf-8") as f:
                f.write(f"<html><head><meta charset='utf-8'><title>{title}</title></head><body>")
                f.write(str(target))
                f.write("</body></html>")
            print(f"保存成功：{filename}")
        else:
            print(f"未找到 class='{target_class}' 的内容")

        visited.add(url)

        # Pick the first menu link that has not been crawled yet.  Every href
        # is resolved against the current page URL, which must therefore stay
        # unchanged until a candidate has actually been chosen.
        next_url = None
        for link in soup.find_all("a", href=True, class_='menu-item'):
            candidate = urljoin(url, link["href"])
            if candidate not in visited:
                next_url = candidate
                break

        if next_url is None:
            print("未找到下一页链接，结束", h1)
            print('-------end-------')
            break

        url = next_url
        count += 1
        print('-------end-------')
    

# Example entry page; replace it with the URL of the first page to crawl.
start_url = "initial.url"
crawl(url=start_url)

运行环境：Python 版本低于 3.12。
这个脚本只适用于静态页面；对于由 JS 动态生成的内容，它获取不到。这种情况可以使用支持 JS 渲染的工具，如：

Selenium。模拟浏览器，可处理任何复杂网页，稳定但稍慢
Playwright。现代自动化框架，更快、稳定
requests-html。轻量但功能有限，依赖较多

实现：使用 Playwright 模拟浏览器访问：

from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time

# NOTE: no trailing commas here. ``name = "value",`` binds a one-element
# tuple, not a string, which breaks page.goto() and the BeautifulSoup lookups.
start_url = "initial_url"  # URL of the first page to crawl
content_class = "content_class"  # class of the element holding the content
next_div_id = "next"  # id of the div that contains the "next page" link

def crawl_pages(max_pages=5):
    """Render pages in headless Chromium and save their main content locally.

    Starting from the module-level ``start_url``, each page is loaded with
    Playwright.  The first element whose class is ``content_class`` is written
    to ``<h1 text>.html`` in the current directory, and the first ``<a>``
    inside the ``div`` whose id is ``next_div_id`` is followed to reach the
    next page.  Crawling stops when no such link exists or after *max_pages*
    pages.

    Args:
        max_pages: Maximum number of pages to visit.

    Returns:
        None.  Results are written to disk and progress is printed.
    """
    # Characters that are not allowed in file names on common file systems.
    unsafe_chars = str.maketrans({ch: "_" for ch in '\\/:*?"<>|'})

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)  # set to False to debug visually
        try:
            # user_agent must be one string.  Comma-separated strings inside
            # parentheses form a tuple, which Playwright does not accept.
            page = browser.new_page(user_agent="headers")

            url = start_url
            for i in range(max_pages):
                page.goto(url)
                page.wait_for_timeout(3000)  # give client-side scripts 3 s to render

                soup = BeautifulSoup(page.content(), "html.parser")

                # Use the <h1> text as the file name; fall back to the page
                # number when the heading is missing or empty.
                h1 = soup.find("h1")
                title = (h1.get_text(strip=True) if h1 else "") or f"page_{i+1}"
                filename = f"{title}.html".translate(unsafe_chars)

                content = soup.find(class_=content_class)
                if content:
                    with open(filename, "w", encoding="utf-8") as f:
                        f.write(str(content))
                    print(f"保存：{filename}")
                else:
                    print(f"第 {i+1} 页未找到内容")

                # Follow the first link inside the "next" container, if any.
                next_div = soup.find("div", id=next_div_id)
                next_link = next_div.find("a") if next_div else None
                if not (next_link and next_link.get("href")):
                    print("无下一页，停止爬取")
                    break
                url = urljoin(url, next_link["href"])
                time.sleep(2)
        finally:
            # Close the browser even when navigation or parsing raises, so no
            # Chromium process is left behind.
            browser.close()

# Run the Playwright crawler, limited to a single page.
crawl_pages(max_pages=1)

注意，如果爬虫请求太频繁，会被服务器限流或临时封禁。
可选的应对方式有：

降低请求频率，每次请求之间增加延迟（随机等待更好），模拟人类行为，即 time.sleep(num)
添加 headers（特别是 User-Agent）
使用代理 IP。这是 IP 已被封禁时的补救措施。