# 网页内容抓取
源码: https://github.com/x-haose/hs-net/blob/main/examples/real_world/scraping.py
scraping.py
"""
网页内容抓取
从网页中提取结构化数据。
"""
from hs_net import SyncNet
def main():
    """Fetch https://example.com and print its title, links, and paragraphs.

    Opens a ``SyncNet`` session with ``retries=2`` and
    ``user_agent="chrome"``, issues one GET request, and prints the results
    of three CSS selector queries run against the response.
    """
    with SyncNet(retries=2, user_agent="chrome") as client:
        page = client.get("https://example.com")

        # Page title, taken from the `title::text` selector.
        title = page.css("title::text").get()
        print(f"标题: {title}")

        # Each `a` element: query its href attribute and its text.
        for anchor in page.css("a"):
            href = anchor.css("::attr(href)").get()
            text = anchor.css("::text").get()
            print(f"链接: {text} -> {href}")

        # Every result of the `p::text` selector, one line per entry.
        for p in page.css("p::text").getall():
            print(f"段落: {p}")
# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
