CSS / XPath / 正则选择器

源码: https://github.com/x-haose/hs-net/blob/main/examples/parsing/css_xpath.py

css_xpath.py
"""
CSS / XPath / 正则选择器

从 HTML 页面中提取数据的三种方式。
"""

from hs_net import SyncNet


def main():
    """Fetch https://example.com once and demo each extraction helper on it.

    The single response is queried with CSS selectors, XPath expressions,
    the regex helpers, and ``to_url`` in turn; every result is printed.
    """
    # NOTE(review): retries=0 presumably disables retrying — confirm in hs_net.
    with SyncNet(retries=0) as client:
        page = client.get("https://example.com")

        # ---- CSS selectors ----
        # Text content of the <title> element.
        title_nodes = page.css("title::text")
        page_title = title_nodes.get()
        print(f"CSS 标题: {page_title}")
        # e.g. => Example Domain

        # href attribute of every <a> element.
        anchor_hrefs = page.css("a::attr(href)").getall()
        print(f"CSS 链接: {anchor_hrefs}")
        # e.g. => ['https://www.iana.org/domains/example']

        # Text of an <a> element (presumably the first match — confirm .get()).
        anchor_label = page.css("a::text").get()
        print(f"CSS 链接文字: {anchor_label}")
        # e.g. => More information...

        # ---- XPath ----
        # Text nodes that sit directly under <p> elements.
        paragraph_nodes = page.xpath("//p/text()")
        paragraph_texts = paragraph_nodes.getall()
        print(f"XPath 段落: {paragraph_texts}")

        # href attribute selected through XPath (no predicate is applied).
        xpath_href = page.xpath("//a/@href").get()
        print(f"XPath 链接: {xpath_href}")

        # ---- Regular expressions ----
        # Capture the word that follows "domain in the".
        captured_word = page.re_first(r"domain in the (\w+)")
        print(f"正则提取: {captured_word}")

        # Collect every match: capitalised words of six or more letters.
        capitalised_words = page.re(r"\b[A-Z][a-z]{5,}\b")
        print(f"正则所有匹配: {capitalised_words}")

        # ---- to_url: relative paths to absolute URLs ----
        raw_hrefs = page.css("a::attr(href)").getall()
        resolved_hrefs = page.to_url(raw_hrefs)
        print(f"to_url 转换: {raw_hrefs} -> {resolved_hrefs}")

        # Mixed relative and absolute input; per the original example,
        # absolute URLs are passed through unchanged.
        sample_urls = ["/relative/path", "https://example.com/absolute"]
        resolved_sample = page.to_url(sample_urls)
        print(f"to_url 混合: {resolved_sample}")

        # A single string is accepted as well as a list.
        resolved_single = page.to_url("/page")
        print(f"to_url 单个: {resolved_single}")


# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()