洛谷题解爬虫教程
csxx601cjy · · 科技·工程
:::epigraph[——我] 找不到题解写的蒟蒻有福了! :::
功能
爬取洛谷题目的题解页面,识别出哪些题目可以写题解。
就类似于洛谷题解站,但是题解站挂了。
步骤
确保你安装了 Python 3.6 或更高版本(下面的源码使用了 f-string,低于 3.6 的版本无法运行)
cmd 中使用 python --version 查看你的 python 版本。
如果没有,打开 Python 官网的下载页面(https://www.python.org/downloads/),按照提示安装。
确保你有 pip
如果你的 python 是 3.4 及以上版本,则已经自带 pip。
cmd 中使用 pip --version 查看你的 pip 版本。
如果没有,执行以下命令安装:
curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
python get-pip.py --index-url https://mirrors.aliyun.com/pypi/simple/
确保你安装了 requests 库
如果没有,执行以下命令安装:
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple requests
以下是 python 源码:
import random
import re
import time
import traceback

import requests
L = 13000 # 起始题号
R = 13020 # 结束题号
delay_between_requests = 1.5 # 每次请求间隔(秒)
休息间隔 = 50 # 每爬取多少题休息一次
休息时间 = 15 # 休息时间(秒)
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"
COOKIES = {
"__client_id": "这是我的 cookie 绝对不能透露给他人", # 在此处填写 cookie
"_uid": "这是洛谷的 uid", # 在此处填写 uid
}
difficulty_map = {0: "灰",1: "红",2: "橙",3: "黄",4: "绿",5: "蓝",6: "紫",7: "黑"}
if L < R:
problem_numbers = range(L, R + 1)
else:
problem_numbers = range(R, L - 1, -1)
headers = {
"User-Agent": user_agent,
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Referer": "https://www.luogu.com.cn/"
}
data = []
accept_pattern = r'"acceptSolution":(true|false)'
difficulty_pattern = r'"difficulty":(\d)'
def generate_output(data, difficulty_names=None):
    """Build the text report lines for a list of crawl results.

    Results are split into three groups: problems that accept solutions,
    problems that do not, and problems whose crawl recorded an error. The
    first two groups are sorted by difficulty and then by problem number
    (an unknown difficulty sorts last); errors keep their input order.

    Args:
        data: Iterable of dicts with the keys ``"pid"``, ``"accept_status"``,
            ``"difficulty"`` and ``"error"``.
        difficulty_names: Optional mapping from numeric difficulty to its
            display label. Defaults to the module-level ``difficulty_map``.

    Returns:
        A list of report lines. The error section is only present when at
        least one item has a truthy ``"error"``.
    """
    if difficulty_names is None:
        difficulty_names = difficulty_map

    can_submit, cannot_submit, errors = [], [], []
    for item in data:
        # An item with an error is reported as such even if part of its
        # data was extracted successfully.
        if item["error"]:
            errors.append(item)
        elif item["accept_status"]:
            can_submit.append(item)
        else:
            cannot_submit.append(item)

    def sort_key(item):
        difficulty = item["difficulty"]
        # 999 pushes items with unknown difficulty to the end of a section.
        return (999 if difficulty is None else difficulty, item["pid"])

    def section_lines(items, verdict):
        """Format one sorted section; *verdict* is the per-line status text."""
        lines = []
        for item in sorted(items, key=sort_key):
            diff = item["difficulty"]
            if diff is None:
                diff_text = "未知"
            else:
                diff_text = f"{diff}({difficulty_names.get(diff, '未知')})"
            lines.append(f"P{item['pid']}: {verdict}, 难度:{diff_text}")
        return lines

    output_lines = ["===== 能交题解的题目 ====="]
    output_lines.extend(section_lines(can_submit, "能交题解"))
    output_lines.append("\n===== 不能交题解的题目 =====")
    output_lines.extend(section_lines(cannot_submit, "不能交题解"))
    if errors:
        output_lines.append("\n===== 爬取错误的题目 =====")
        output_lines.extend(f"P{item['pid']}: {item['error']}" for item in errors)
    return output_lines
# Main crawl loop: fetch the solution page of every problem in
# problem_numbers, record what was found in `data`, and always write the
# report to record.txt at the end, even if the loop is cut short.
try:
    count = 0
    for pid in problem_numbers:
        count += 1
        url = f"https://www.luogu.com.cn/problem/solution/P{pid}"
        print(f"正在爬取 P{pid}...")
        try:
            response = requests.get(
                url,
                headers=headers,
                cookies=COOKIES,
                timeout=15,
            )
            response.encoding = "utf-8"
            item = {
                "pid": pid,
                "accept_status": None,
                "difficulty": None,
                "error": None,
            }
            if response.status_code != 200:
                item["error"] = f"状态码错误: {response.status_code}"
            else:
                accept_match = re.search(accept_pattern, response.text)
                difficulty_match = re.search(difficulty_pattern, response.text)
                if accept_match:
                    item["accept_status"] = accept_match.group(1).lower() == "true"
                else:
                    item["error"] = "未找到acceptSolution属性"
                if difficulty_match:
                    item["difficulty"] = int(difficulty_match.group(1))
                elif item["error"]:
                    # Both fields are missing: keep the first message and
                    # append the second one to it.
                    item["error"] += "; 未找到难度信息"
                else:
                    item["error"] = "未找到难度信息"
            data.append(item)
            print(f"P{pid} 处理完成")
            if count % 休息间隔 == 0:
                # Longer break every 休息间隔 problems.
                print(f"\n已爬取{count}题,休息{休息时间}秒...\n")
                time.sleep(休息时间)
            else:
                # Base delay plus up to one second of random jitter.
                time.sleep(delay_between_requests + random.uniform(0, 1))
        except Exception as e:
            # Best-effort per problem: record the failure and move on to the
            # next problem instead of aborting the whole run.
            error_msg = f"爬取失败: {str(e)}"
            data.append({"pid": pid, "accept_status": None, "difficulty": None, "error": error_msg})
            print(f"P{pid} 处理失败: {error_msg}")
            time.sleep(3)
except Exception:
    print("\n===== 程序发生严重错误 =====")
    # Print the full stack trace. Relies on the file-level
    # `import traceback`; without it this line raises NameError and hides
    # the original error.
    print(traceback.format_exc())
finally:
    # Runs on every exit path, so whatever was collected so far is written
    # to the report.
    output_lines = generate_output(data)
    with open("record.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(output_lines))
    print("\n程序结束,已生成 record.txt(包含已爬取的所有数据)")
    input("按回车键关闭窗口...")
把源码保存成一个 .py 文件,名字自取。
源码开头带注释的配置项(题号范围、请求间隔、休息间隔与休息时间、cookie 和 uid)是用户可以自己填写的内容。