爬取单日数据
前期准备
我要爬取的网站是
1
https://www.icbc.com.cn/ICBC/%e9%87%91%e8%9e%8d%e4%bf%a1%e6%81%af/%e8%a1%8c%e6%83%85%e6%95%b0%e6%8d%ae/%e4%ba%ba%e6%b0%91%e5%b8%81%e5%8d%b3%e6%9c%9f%e5%a4%96%e6%b1%87%e7%89%8c%e4%bb%b7/
数据是这样的:币种、银行买入价、银行卖出价、发布时间

环境:python 3.12.2
寻找网络请求接口
在浏览器中按F12打开开发者工具,然后点击Network,勾选 Preserve log
-
选择 Fetch / XHR
XMLHttpRequest (XHR)和Fetch API都是网页向服务器发起 HTTP 请求的机制。Fetch API 是一种现代的、基于 Promise 的替代方案,用于取代老旧的、基于回调的 XHR,它提供了更简洁的语法和更适合现代网页开发的功能。 -
手动操作页面:
1 2 3
选择一个日期(比如 2021-01-25) 点击【查询】
-
Network 列表里 新出现的 POST 请求
可以确定工商银行新版 PAPI 的“历史外汇牌价接口”:POST https://papi.icbc.com.cn/exchanges/ns/history
如何使用接口
请求方式
-
POST
-
Content-Type: application/json
请求体
1
2
3
4
5
{
"date": "2021-01-13",
"currType": "",
"serverType": "1"
}
👉 这个接口不支持“时间区间”
👉 需要一天一天请求
编写代码
点击展开/折叠代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
import requests

# JSON body of the query: one date per request; currType is left empty
# (presumably "all currencies" -- confirm against the endpoint).
payload = {"date": "2021-01-13", "currType": "", "serverType": "1"}

# Headers sent with the POST, including the Origin/Referer of the quote page.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Content-Type": "application/json",
    "Origin": "https://www.icbc.com.cn",
    "Referer": "https://www.icbc.com.cn/ICBC/%E9%87%91%E8%9E%8D%E4%BF%A1%E6%81%AF/%E8%A1%8C%E6%83%85%E6%95%B0%E6%8D%AE/%E4%BA%BA%E6%B0%91%E5%B8%81%E5%8D%B3%E6%9C%9F%E5%A4%96%E6%B1%87%E7%89%8C%E4%BB%B7/",
}

url = "https://papi.icbc.com.cn/exchanges/ns/history"

# POST the body, fail loudly on an HTTP error status, then print the decoded JSON.
response = requests.post(url, json=payload, headers=headers, timeout=10)
response.raise_for_status()
print(response.json())
报错:[SSL: UNSAFE_LEGACY_RENEGOTIATION_DISABLED]
- 当前的 Python + OpenSSL 与工行 PAPI 服务器的 TLS 协商不兼容
- 更准确地说: 服务器不支持安全重新协商(RFC 5746),握手时触发了“不安全的旧式 TLS 重新协商”,而 Python 所依赖的 OpenSSL 3.x 默认已经禁止这种连接
为什么浏览器能访问,我的 Python 不能? 浏览器(Chrome / Edge):
1
2
3
4
5
自带 定制版 TLS 实现
对一些“历史遗留银行系统”做了兼容
会悄悄降级 / 特判
Python(requests / urllib3):
1
2
3
4
5
用的是 OpenSSL 官方实现
默认严格安全策略
直接拒绝这种 TLS 行为
👉 银行系统(尤其是国内)
👉 TLS 配置老 + 中间网关
👉 非常容易触发这个问题
解决方案
- 彻底禁用代理,哪怕不用代理工具,也很可能被环境变量污染。
```python
import os
os.environ["HTTP_PROXY"] = ""
os.environ["HTTPS_PROXY"] = ""
os.environ["ALL_PROXY"] = ""
```
1
2
3
4
5
6
7
* requests 调用时显式指定代理
```python
# Per-request proxy mapping: both schemes are set to None. It is meant to be
# passed as `proxies=` to the requests call so that no proxy is used.
proxies = {
"http": None,
"https": None
}
# Alternatively, disable the proxies directly in code.
- SSL Adapter + verify=False
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# ===== 2. SSL adapter: works around legacy TLS negotiation and certificate verification =====
class LegacySSLAdapter(requests.adapters.HTTPAdapter):
    """Transport adapter whose connection pool uses a relaxed SSL context.

    The context enables OpenSSL's legacy-server-connect option and switches
    off hostname checking and certificate verification.

    WARNING: turning verification off removes protection against
    man-in-the-middle attacks; mount this adapter only for the endpoint
    that needs it.
    NOTE(review): requests may still apply its own per-request ``verify``
    handling on top of this context -- confirm with the installed
    requests/urllib3 versions.
    """

    def init_poolmanager(self, connections, maxsize, block=False, **pool_kwargs):
        """Create the urllib3 ``PoolManager`` backed by the relaxed SSL context.

        Args:
            connections: Number of connection pools to cache.
            maxsize: Maximum number of connections kept per pool.
            block: Whether a pool blocks when no connection is free.
            **pool_kwargs: Extra keyword arguments forwarded to ``PoolManager``.
        """
        # Build the SSL context.
        ctx = create_urllib3_context()
        # Work around the legacy TLS negotiation problem. Use the named
        # constant when this Python provides it; 0x4 is the raw value of
        # OP_LEGACY_SERVER_CONNECT.
        ctx.options |= getattr(ssl, "OP_LEGACY_SERVER_CONNECT", 0x4)
        # Hostname checking has to be disabled first, then verification.
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE
        # Always use our context, even if a caller supplied another one.
        pool_kwargs["ssl_context"] = ctx
        self.poolmanager = PoolManager(
            num_pools=connections,
            maxsize=maxsize,
            block=block,
            **pool_kwargs,
        )
点击展开/折叠代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import os
import requests
import ssl
from urllib3.poolmanager import PoolManager
from urllib3.util.ssl_ import create_urllib3_context
# ===== 1. Disable proxies completely =====
# Blank out every proxy-related environment variable in a single call.
os.environ.update({"HTTP_PROXY": "", "HTTPS_PROXY": "", "ALL_PROXY": ""})
# ===== 2. SSL adapter: works around legacy TLS negotiation and certificate verification =====
class LegacySSLAdapter(requests.adapters.HTTPAdapter):
    """Transport adapter whose connection pool uses a relaxed SSL context.

    The context enables OpenSSL's legacy-server-connect option and switches
    off hostname checking and certificate verification.

    WARNING: turning verification off removes protection against
    man-in-the-middle attacks; mount this adapter only for the endpoint
    that needs it.
    NOTE(review): requests may still apply its own per-request ``verify``
    handling on top of this context -- confirm with the installed
    requests/urllib3 versions.
    """

    def init_poolmanager(self, connections, maxsize, block=False, **pool_kwargs):
        """Create the urllib3 ``PoolManager`` backed by the relaxed SSL context.

        Args:
            connections: Number of connection pools to cache.
            maxsize: Maximum number of connections kept per pool.
            block: Whether a pool blocks when no connection is free.
            **pool_kwargs: Extra keyword arguments forwarded to ``PoolManager``.
        """
        # Build the SSL context.
        ctx = create_urllib3_context()
        # Work around the legacy TLS negotiation problem. Use the named
        # constant when this Python provides it; 0x4 is the raw value of
        # OP_LEGACY_SERVER_CONNECT.
        ctx.options |= getattr(ssl, "OP_LEGACY_SERVER_CONNECT", 0x4)
        # Hostname checking has to be disabled first, then verification.
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE
        # Always use our context, even if a caller supplied another one.
        pool_kwargs["ssl_context"] = ctx
        self.poolmanager = PoolManager(
            num_pools=connections,
            maxsize=maxsize,
            block=block,
            **pool_kwargs,
        )
url = "https://papi.icbc.com.cn/exchanges/ns/history"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Content-Type": "application/json",
    "Origin": "https://www.icbc.com.cn",
    "Referer": "https://www.icbc.com.cn/ICBC/%E9%87%91%E8%9E%8D%E4%BF%A1%E6%81%AF/%E8%A1%8C%E6%83%85%E6%95%B0%E6%8D%AE/%E4%BA%BA%E6%B0%91%E5%B8%81%E5%8D%B3%E6%9C%9F%E5%A4%96%E6%B1%87%E7%89%8C%E4%BB%B7/",
}
# One date per request; currType is left empty.
payload = {
    "date": "2021-01-13",
    "currType": "",
    "serverType": "1",
}
# Both schemes mapped to None so this request does not go through a proxy.
proxies = {
    "http": None,
    "https": None,
}

# Use the session as a context manager so its pooled connections are released
# even if the request raises.
with requests.Session() as session:
    # Route every https:// request through the relaxed-SSL adapter.
    session.mount("https://", LegacySSLAdapter())
    # verify=False is deliberately not passed here; the adapter's SSL context
    # is expected to carry the verification settings.
    # NOTE(review): confirm requests does not re-enable verification per request.
    r = session.post(
        url,
        headers=headers,
        json=payload,
        timeout=15,
        proxies=proxies,
    )

# The body was fully read by the non-streaming request, so it is still
# available after the session is closed.
print(r.status_code)
print(r.text)
结果:

爬取近五年数据
点击展开/折叠代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import os
import time
import ssl
import requests
import pandas as pd
from datetime import datetime, timedelta
from urllib3.poolmanager import PoolManager
from urllib3.util.ssl_ import create_urllib3_context
# ===============================
# 1. Disable proxies completely
# ===============================
# Blank out every proxy-related environment variable in a single call.
os.environ.update({"HTTP_PROXY": "", "HTTPS_PROXY": "", "ALL_PROXY": ""})
# ===============================
# 2. SSL adapter (bank-specific)
# ===============================
class LegacySSLAdapter(requests.adapters.HTTPAdapter):
    """Transport adapter whose connection pool uses a relaxed SSL context.

    The context enables OpenSSL's legacy-server-connect option and switches
    off hostname checking and certificate verification.

    WARNING: turning verification off removes protection against
    man-in-the-middle attacks; mount this adapter only for the endpoint
    that needs it.
    NOTE(review): requests may still apply its own per-request ``verify``
    handling on top of this context -- confirm with the installed
    requests/urllib3 versions.
    """

    def init_poolmanager(self, connections, maxsize, block=False, **pool_kwargs):
        """Create the urllib3 ``PoolManager`` backed by the relaxed SSL context.

        Args:
            connections: Number of connection pools to cache.
            maxsize: Maximum number of connections kept per pool.
            block: Whether a pool blocks when no connection is free.
            **pool_kwargs: Extra keyword arguments forwarded to ``PoolManager``.
        """
        ctx = create_urllib3_context()
        # Use the named constant when this Python provides it; 0x4 is the raw
        # value of OP_LEGACY_SERVER_CONNECT.
        ctx.options |= getattr(ssl, "OP_LEGACY_SERVER_CONNECT", 0x4)
        # Hostname checking has to be disabled before verify_mode is CERT_NONE.
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE
        # Always use our context, even if a caller supplied another one.
        pool_kwargs["ssl_context"] = ctx
        self.poolmanager = PoolManager(
            num_pools=connections,
            maxsize=maxsize,
            block=block,
            **pool_kwargs,
        )
# ===============================
# 3. Base request configuration
# ===============================
# Endpoint that receives the per-day POST.
URL = "https://papi.icbc.com.cn/exchanges/ns/history"
# Both schemes map to None, i.e. no proxy for either.
PROXIES = dict.fromkeys(("http", "https"))
# Headers sent with every request.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Content-Type": "application/json",
    "Origin": "https://www.icbc.com.cn",
    "Referer": "https://www.icbc.com.cn/ICBC/",
}

# ===============================
# 4. Create the session
# ===============================
# One shared session; every https:// request goes through the custom adapter.
session = requests.Session()
session.mount(prefix="https://", adapter=LegacySSLAdapter())
# ===============================
# 5. Single-day fetch (with retries)
# ===============================
def fetch_one_day(date_str, max_retry=3):
    """Fetch the quote records for one date from the history endpoint.

    Args:
        date_str: Date to query, formatted as ``YYYY-MM-DD``.
        max_retry: Maximum number of attempts when the request itself fails.

    Returns:
        The value stored under ``"data"`` in the response when its ``"code"``
        equals 0; ``None`` when the service answers with any other payload or
        when every attempt fails.
    """
    payload = {
        "date": date_str,
        "currType": "",
        "serverType": "1",
    }

    for attempt in range(1, max_retry + 1):
        # Keep the try body limited to the calls that can fail on I/O or decoding.
        try:
            r = session.post(
                URL,
                headers=HEADERS,
                json=payload,
                timeout=15,
                proxies=PROXIES,
            )
            r.raise_for_status()
            data = r.json()
        except (requests.RequestException, ValueError) as e:
            print(f"[RETRY {attempt}] {date_str} 失败: {e}")
            # Linear back-off between attempts; sleeping after the final
            # attempt would only delay the caller.
            if attempt < max_retry:
                time.sleep(2 * attempt)
            continue

        if isinstance(data, dict) and data.get("code") == 0:
            return data.get("data")

        # A well-formed answer with an unexpected code is not retried.
        print(f"[WARN] {date_str} 返回异常 code: {data}")
        return None

    return None
# ===============================
# 6. Date range (last 5 years)
# ===============================
end_date = datetime.today()
try:
    # Step back five calendar years; a fixed 5 * 365 days ignores leap days
    # and would start the window one or two days late.
    start_date = end_date.replace(year=end_date.year - 5)
except ValueError:
    # Feb 29 has no counterpart in a non-leap target year; use Feb 28.
    start_date = end_date.replace(year=end_date.year - 5, day=28)
# ===============================
# 7. Main loop
# ===============================
all_rows = []
missing_dates = []
# Fields copied from each record into an output row, in column order.
_ROW_FIELDS = (
    "currencyENName",
    "currencyCHName",
    "reference",
    "foreignBuy",
    "foreignSell",
    "cashBuy",
    "cashSell",
)
current = start_date
try:
    while current <= end_date:
        date_str = current.strftime("%Y-%m-%d")
        print(f"📅 抓取 {date_str}")
        day_data = fetch_one_day(date_str)
        if day_data:
            all_rows.extend(
                {"date": date_str, **{field: item.get(field) for field in _ROW_FIELDS}}
                for item in day_data
            )
        else:
            # Nothing came back for this date; record it for the final report.
            missing_dates.append(date_str)
        time.sleep(0.5)  # bank endpoint: keep the request rate low
        current += timedelta(days=1)
finally:
    # Release the pooled connections even if the loop is interrupted.
    session.close()
# ===============================
# 8. DataFrame & CSV
# ===============================
# Turn the collected rows into a table and write it next to this script.
df = pd.DataFrame(all_rows)
script_dir = os.path.dirname(__file__)
csv_path = os.path.join(script_dir, "icbc_exchange_5y.csv")
df.to_csv(csv_path, index=False, encoding="utf-8-sig")

# Summary: output location, row count, and the dates that yielded no data.
row_count = len(df)
print("\n✅ 数据抓取完成")
print(f"📄 CSV 文件:{csv_path}")
print(f"📊 总记录数:{row_count}")
if not missing_dates:
    print("🎉 无缺失日期")
else:
    print(f"⚠️ 缺失日期 ({len(missing_dates)}):")
    print(missing_dates)