OpenClaw 新手教程

openclaw openclaw解答 1

什么是 OpenClaw?

OpenClaw 是一个开源的多功能网络爬虫/数据采集框架,专门设计用于网页数据抓取、API数据采集和自动化数据处理。

OpenClaw 新手教程-第1张图片-官方openclaw下载|openclaw官网-国内ai小龙虾下载

安装 OpenClaw

使用 pip 安装

pip install openclaw

从源码安装

git clone https://github.com/openclaw/openclaw.git
cd openclaw
pip install -e .

基础使用

简单网页抓取示例

from openclaw import Claw
# Create a crawler instance
claw = Claw()
# Fetch a web page
response = claw.fetch("https://example.com")
# Parse the response body as HTML
parsed = claw.parse(response.content, parser='html')
# Extract the page title.
# FIX: the original line lost its assignment target ("# 提取数据= parsed...."),
# leaving `title` undefined on the next line.
title = parsed.find('title').text
print(f"网页标题: {title}")

配置爬虫

from openclaw import Claw
from openclaw.config import CrawlConfig

# Build the crawl settings first, then hand them to the crawler.
crawl_settings = CrawlConfig(
    user_agent="MyCrawler/1.0",
    delay=2,          # seconds to wait between requests
    timeout=30,       # per-request timeout
    retry_times=3,    # retries before giving up
    use_proxy=False,
)
# A crawler constructed with an explicit configuration
claw = Claw(config=crawl_settings)

抓取多个页面

import asyncio
from openclaw import AsyncClaw

async def fetch_multiple():
    """Fetch several pages concurrently and print each response size."""
    crawler = AsyncClaw()
    page_urls = [
        "https://example.com/page1",
        "https://example.com/page2",
        "https://example.com/page3",
    ]
    # One batched call fetches every URL concurrently.
    responses = await crawler.fetch_all(page_urls)
    for page_url, resp in zip(page_urls, responses):
        if resp:
            print(f"{page_url}: {len(resp.content)} bytes")

# Drive the coroutine to completion.
asyncio.run(fetch_multiple())

核心功能

数据提取器

from openclaw.extractors import CSSExtractor, XPathExtractor, RegexExtractor
# CSS-selector extraction.
# FIX: both rule dicts below were missing the 'title' key name ("{: ...}"),
# which is a syntax error.
css_extractor = CSSExtractor()
data = css_extractor.extract(html_content, {
    'title': 'h1.title',
    'links': 'a.link@href',  # @href selects the attribute value
    'content': 'div.content@text'
})
# XPath extraction
xpath_extractor = XPathExtractor()
data = xpath_extractor.extract(html_content, {
    'title': '//h1[@class="title"]/text()',
    'items': '//div[@class="item"]'
})
# Regular-expression extraction
regex_extractor = RegexExtractor()
data = regex_extractor.extract(text_content, {
    # FIX: the TLD class was [A-Z|a-z]; inside a character class '|' is a
    # literal pipe, so addresses like "a@b.c|d" would have matched.
    'emails': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
    'phones': r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b'
})

数据管道

from openclaw.pipelines import CSVPipeline, JSONPipeline, DatabasePipeline

# Push the scraped items through each built-in output pipeline in turn:
# CSV file, JSON file, then a SQLite table.
for pipeline in (
    CSVPipeline(output_file='data.csv'),
    JSONPipeline(output_file='data.json'),
    DatabasePipeline(
        db_type='sqlite',
        connection_string='data.db',
        table_name='items',
    ),
):
    pipeline.process_items(items)

中间件

from openclaw import Claw  # FIX: Claw is used below but was never imported
from openclaw.middleware import RotateUserAgent, DelayMiddleware, ProxyMiddleware
# Assemble the middleware chain; middlewares apply in the order added.
claw = Claw()
claw.add_middleware(RotateUserAgent())  # rotate the User-Agent header per request
claw.add_middleware(DelayMiddleware(delay=1.5))  # throttle: 1.5 s between requests
claw.add_middleware(ProxyMiddleware(proxy_list=['http://proxy1:8080', 'http://proxy2:8080']))

实战示例:抓取新闻网站

from openclaw import Claw
from openclaw.extractors import CSSExtractor
from openclaw.pipelines import JSONPipeline

class NewsCrawler:
    """Two-stage news scraper: listing page -> article pages -> JSON file."""

    def __init__(self):
        # One crawler, extractor, and pipeline shared across the run.
        self.claw = Claw()
        self.extractor = CSSExtractor()
        self.pipeline = JSONPipeline('news.json')

    def crawl_news(self, start_url):
        """Crawl the listing at *start_url*, then every article it links to."""
        # Stage 1: the listing page yields the article links.
        listing = self.claw.fetch(start_url)
        link_data = self.extractor.extract(
            listing.content,
            {'news_links': 'a.news-link@href'},
        )
        # Stage 2: visit each article and pull out its fields.
        article_rules = {
            'title': 'h1.news-title@text',
            'content': 'div.article-content@text',
            'publish_date': 'span.date@text',
            'author': 'span.author@text'
        }
        collected = [
            self.extractor.extract(self.claw.fetch(link).content, article_rules)
            for link in link_data['news_links']
        ]
        # Persist everything with one pipeline call.
        self.pipeline.process_items(collected)
        print(f"成功抓取 {len(collected)} 条新闻")

# Drive the crawler.
crawler = NewsCrawler()
crawler.crawl_news('https://news.example.com/latest')

高级功能

分布式爬虫

from openclaw.distributed import RedisQueue, DistributedCrawler

# A Redis-backed queue shared by several worker processes.
task_queue = RedisQueue(
    host='localhost',
    port=6379,
    queue_name='crawl_queue',
)
# Crawler that pulls tasks from the queue with 4 worker processes.
dist_crawler = DistributedCrawler(queue=task_queue, worker_count=4)
# Seed the queue with an initial task.
task_queue.put_task({
    'url': 'https://example.com',
    'callback': 'parse_function'
})

反爬虫绕过

from openclaw.anti_anti_crawl import JavaScriptRenderer, HeadlessBrowser

# Option 1: drive a headless browser for JavaScript-rendered pages.
headless = HeadlessBrowser()
html = headless.get_page('https://dynamic-site.example.com')

# Option 2: use the standalone JavaScript renderer instead.
js_renderer = JavaScriptRenderer()
rendered_html = js_renderer.render(url='https://dynamic-site.example.com')

监控和日志

import logging

from openclaw.monitor import CrawlMonitor

# Root-logger setup: timestamped INFO-level output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

monitor = CrawlMonitor()

@monitor.track  # the monitor records every invocation of the decorated task
def crawl_task():
    """Placeholder crawl job tracked by the monitor."""
    pass

# Read the aggregated counters back out.
stats = monitor.get_stats()
print(f"已抓取: {stats['pages_crawled']} 页")
print(f"成功率: {stats['success_rate']:.2%}")

最佳实践

  1. 遵守 robots.txt

    claw.respect_robots_txt = True
  2. 设置合理的延迟

    config = CrawlConfig(delay=2.0)  # 避免过快请求
  3. 错误处理

    try:
        response = claw.fetch(url)
    except Exception as e:
        print(f"抓取失败: {e}")
        # 记录失败,稍后重试
  4. 数据去重

    from openclaw.utils import BloomFilter
    bf = BloomFilter(capacity=1000000)
    if not bf.contains(url):
        bf.add(url)  # 抓取这个URL


  5. 资源管理

    # 使用上下文管理器自动清理资源
    with Claw() as claw:
        response = claw.fetch(url)
        # 处理响应
    # 爬虫自动关闭

故障排除

常见问题

  1. 连接被拒绝

    • 检查网络连接
    • 尝试使用代理
    • 增加超时时间
  2. 被网站屏蔽

    • 轮换 User-Agent
    • 使用代理池
    • 降低请求频率
  3. 内存泄漏

    • 定期清理缓存
    • 使用分页处理大数据
    • 监控内存使用

调试模式

# Enable debug mode to troubleshoot crawls.
claw = Claw(debug=True)
# Detailed per-request information will be printed.

学习资源

下一步

  1. 尝试修改示例代码以适应你的需求
  2. 查看高级功能文档
  3. 参与社区讨论
  4. 贡献代码或报告问题

始终遵守目标网站的服务条款,尊重 robots.txt 文件,不要对网站造成过大负担。

标签: OpenClaw 新手教程

抱歉,评论功能暂时关闭!