Scrapy meta 参数完全指南
核心概念
meta 是 Request 对象中的字典,用于在请求间传递数据和配置。它就像一个"信息包裹",跟随请求在整个爬虫流程中流动。

常用 meta 键分类
🌐 网络请求控制
| 键 | 类型 | 说明 | 示例 |
| --- | --- | --- | --- |
| proxy | str | 指定代理服务器 | http://proxy.example.com:8080 |
| download_timeout | int/float | 下载超时时间(秒) | 10 |
| dont_redirect | bool | 禁止重定向 | True |
| dont_retry | bool | 禁止重试 | True |
| dont_obey_robotstxt | bool | 忽略 robots.txt | True |
| download_slot | str | 控制并发槽 | "slow_site" |

🍪 Cookie 和认证
| 键 | 类型 | 说明 |
| --- | --- | --- |
| dont_merge_cookies | bool | 不合并 Cookie |
| cookiejar | CookieJar | 指定 Cookie jar |

📦 自定义数据传递
| 键 | 类型 | 说明 |
| --- | --- | --- |
| custom_data | dict | 自定义数据 |
| item | Item/dict | 数据对象 |
| user_data | any | 用户数据 |

⚙️ 其他配置
| 键 | 类型 | 说明 |
| --- | --- | --- |
| dont_cache | bool | 禁用缓存 |
| priority | int | 请求优先级(注:priority 实际是 scrapy.Request 的构造参数,而非 meta 键) |
| errback | callable | 错误回调(注:errback 实际是 scrapy.Request 的构造参数,而非 meta 键) |

代码例子
例子 1: 基础 meta 使用
import scrapy


class MySpider(scrapy.Spider):
    """Demonstrates passing per-request data between requests via the meta dict."""

    name = 'example'
    start_urls = ['http://example.com']

    def start_requests(self):
        # Attach the page number and a per-request download timeout to
        # every request so the callback can read them back later.
        for page in range(1, 4):
            request_meta = {
                'page_num': page,
                'download_timeout': 10,
            }
            yield scrapy.Request(
                url=f'http://example.com/page/{page}',
                meta=request_meta,
                callback=self.parse,
            )

    def parse(self, response):
        # response.meta is the meta dict of the request that produced
        # this response.
        page_num = response.meta['page_num']
        print(f'正在爬取第 {page_num} 页')
        # 处理响应...

例子 2: 使用代理和自定义数据
import scrapy


class ProxySpider(scrapy.Spider):
    """Routes requests through a proxy while carrying custom payload in meta."""

    name = 'proxy_example'

    def start_requests(self):
        # Network settings (proxy, timeout) and custom data (item_id,
        # source) travel together in the same meta dict.
        for url in ('http://example.com/item/1', 'http://example.com/item/2'):
            yield scrapy.Request(
                url=url,
                meta={
                    'proxy': 'http://proxy.example.com:8080',
                    'item_id': url.split('/')[-1],
                    'source': 'example.com',
                    'download_timeout': 15,
                },
                callback=self.parse,
            )

    def parse(self, response):
        # Pull the custom values that were stashed on the request.
        meta = response.meta
        data = {
            'id': meta['item_id'],
            'source': meta['source'],
            'title': response.css('h1::text').get(),
            'content': response.css('.content::text').getall()
        }
        yield data

例子 3: 跨回调函数传递数据
import scrapy
class ListDetailSpider(scrapy.Spider):
name = 'list_detail'
start_urls = ['http://example.com/list']
def parse(self, response):
"""解析列表页"""
for item_url in response.css('a.item::attr(href)').getall():
yield scrapy.Request(
url=response.urljoin(item_url),
meta={
'category': response.css('.category::text').get(),
'list_page': response.url,
'depth': response.meta.get('depth', 0) + 1
},
callback=self.parse_detail
)
def parse_detail(self, response):
"""解析详情页"""
yield {
'title': response.css('h1::text').get(),
'category': response.meta['category'],
'source_list': response.meta['list_page'],
'depth': response.meta['depth'],
'url': response.url
        }

例子 4: 处理错误和重试
import scrapy
from scrapy.http import Request
class ErrorHandlerSpider(scrapy.Spider):
name = 'error_handler'
def start_requests(self):
urls = ['http://example.com/page1', 'http://example.com/page2']
for url in urls:
yield scrapy.Request(
url=url,
meta={
'retry_count': 0,
'max_retries': 3,
'dont_obey_robotstxt': False
},
callback=self.parse,
errback=self.errback
)
def parse(self, response):
if response.status == 200:
yield {
'url': response.url,
'status': response.status,
'data': response.css('body::text').get()
}
def errback(self, failure):
"""错误回调"""
request = failure.request
retry_count = request.meta['retry_count']
max_retries = request.meta['max_retries']
if retry_count < max_retries:
self.logger.warning(f'重试 {request.url} (尝试 {retry_count + 1})')
new_request = request.copy()
new_request.meta['retry_count'] = retry_count + 1
yield new_request
else:
            self.logger.error(f'放弃 {request.url} - 重试次数超限')

例子 5: Item 对象传递
import scrapy
from scrapy import Item, Field


class ProductItem(Item):
    """Container for one product scraped from the site."""

    name = Field()
    price = Field()
    category = Field()
    url = Field()


class ItemPassSpider(scrapy.Spider):
    """Builds an Item on the list page and completes it on the detail page."""

    name = 'item_pass'
    start_urls = ['http://example.com/products']

    def parse(self, response):
        # Pre-fill the fields known on the list page, then hand the
        # partially-built item to the detail callback through meta.
        list_category = response.css('.cat-name::text').get()
        for href in response.css('a.product::attr(href)').getall():
            product = ProductItem()
            product['category'] = list_category
            product['url'] = response.urljoin(href)
            yield scrapy.Request(
                url=product['url'],
                meta={'item': product},
                callback=self.parse_product,
            )

    def parse_product(self, response):
        # Retrieve the item and fill in the detail-page-only fields.
        item = response.meta['item']
        item['name'] = response.css('h1::text').get()
        item['price'] = response.css('.price::text').get()
        yield item

例子 6: 控制并发和优先级
import scrapy
class ConcurrencySpider(scrapy.Spider):
name = 'concurrency'
def start_requests(self):
# 快速页面
for i in range(1, 6):
yield scrapy.Request(
url=f'http://fast-site.com/page/{i}',
meta={
'download_slot': 'fast',
'priority': 10
},
callback=self.parse
)
# 慢速页面
for i in range(1, 4):
yield scrapy.Request(
url=f'http://slow-site.com/page/{i}',
meta={
'download_slot': 'slow',
'download_timeout': 20,
'priority': 5
},
callback=self.parse
)
def parse(self, response):
yield {
'url': response.url,
'data': response.css('body::text').get()[:100]
        }

最佳实践
# 推荐做法
item_id = response.meta.get('item_id', 'unknown')
# 不推荐
item_id = response.meta['item_id']  # 可能报错(KeyError)

总结
meta 是 Scrapy 中强大的数据传递机制,可以: