soup = BeautifulSoup(open("test.html"))
# a markup string
soup = BeautifulSoup("<html>data</html>")
You can omit the parser argument, in which case BeautifulSoup relies on whichever parser library is already installed on the system.
Parser comparison:

html.parser — BeautifulSoup(markup, "html.parser")
- Python's built-in standard library
- moderate speed
- good document error tolerance (poor in versions before Python 2.7.3 / 3.2.2)

lxml HTML parser — BeautifulSoup(markup, "lxml")
- fast
- good document error tolerance
- requires the C library to be installed

lxml XML parser — BeautifulSoup(markup, ["lxml", "xml"]) or BeautifulSoup(markup, "xml")
- fast
- the only parser that supports XML
- requires the C library to be installed

html5lib — BeautifulSoup(markup, "html5lib")
- best error tolerance
- parses documents the way a browser does
- produces HTML5-format documents
- slow
- no external dependencies
BeautifulSoup(markup, "html.parser") uses the Python standard library; its error tolerance is weaker and performance is middling.
BeautifulSoup(markup, "lxml") has strong error tolerance and is fast, but requires the system C library to be installed.
lxml is the recommended parser because it is efficient.
Specify the parser manually so that parsing behaves consistently in every runtime environment, as the sketch below shows.
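A minimal sketch of why this matters, using a deliberately broken fragment (the expected outputs follow the bs4 documentation's example and may vary slightly by library version):

from bs4 import BeautifulSoup

broken = "<a></p>"  # invalid markup on purpose

# each parser repairs the same fragment differently
print(BeautifulSoup(broken, "html.parser"))  # <a></a>
print(BeautifulSoup(broken, "lxml"))         # <html><body><a></a></body></html>
print(BeautifulSoup(broken, "html5lib"))     # <html><head></head><body><a><p></p></a></body></html>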
Build test.html from the content below, then parse it with bs4:
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>首页</title>
</head>
<body>
<h1>xdd欢迎您</h1>
<div id="main">
    <h3 class="title highlight"><a href="http://www.python.org">python</a>高级班</h3>
    <div class="content">
        <p id="first">字典</p>
        <p id="second">列表</p>
        <input type="hidden" name="_csrf" value="absdoia23lkso234r23oslfn">
        <!-- comment -->
        <img id="bg1" src="http://www.xdd.com/">
        <img id="bg2" src="http://httpbin.org/">
    </div>
</div>
<p>bottom</p>
</body>
</html>
BeautifulSoup parses an HTML document into a complex tree structure in which every node is a Python object. The objects fall into 4 kinds:
BeautifulSoup, Tag, NavigableString, Comment
BeautifulSoup object: represents the document as a whole.
Tag object: corresponds to an HTML tag, with 2 commonly used attributes:
name: the Tag object's name, i.e. the tag name
attrs: the tag's attribute dictionary
Multi-valued attributes: a class attribute may take a form like
<h3 class="title highlight">python高级班</h3>
in which case the attribute is multi-valued ({"class": ["title", "highlight"]}).
Attributes can be modified and deleted.
BeautifulSoup.prettify()  # pretty-prints the parsed document (indented output); note: printing a BeautifulSoup object directly dumps the parsed document without formatting
BeautifulSoup.div  # the content of the first matching div, returned as a bs4.element.Tag object
BeautifulSoup.h3.get("class")  # the class attribute value of the first h3 tag in the document
from bs4 import BeautifulSoup

with open("d://xdd.html", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "lxml")
    print(soup.builder)
    # print(0, soup)             # the whole parsed document, unformatted
    # print(1, soup.prettify())  # the document pretty-printed
    print("- " * 30)
    # print(2, soup.div, type(soup.div))  # type bs4.element.Tag, a Tag object
    # print(3, soup.div["class"])  # raises KeyError: the div has no class attribute
    print(3, soup.div.get("class"))  # the div's class attribute; None if absent
    print(4, soup.div.h3["class"])   # multi-valued attribute
    print(4, soup.h3.get("class"))   # multi-valued: class of the first h3 in the document
    print(4, soup.h3.attrs.get("class"))  # multi-valued attribute
    print(5, soup.img.get("src"))    # the src attribute of the first img
    soup.img["src"] = "http://www.xddupdate.com"  # modify the value
    print(5, soup.img["src"])
    print(6, soup.a)       # None if not found
    del soup.h3["class"]   # delete the attribute
    print(4, soup.h3.get("class"))
Note: we don't normally manipulate HTML through attribute access like this; the code above is only for getting familiar with the object types.
NavigableString
To output only a tag's text, without caring about the tag itself, use NavigableString.
print(soup.div.p.string)  # the string of the first p under the first div
print(soup.p.string)      # same as above
Comment object: an HTML comment; after parsing, BeautifulSoup represents it as a Comment object.
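A minimal sketch (inline markup made up for the demo) showing that a parsed comment comes back as a Comment, a subclass of NavigableString:

from bs4 import BeautifulSoup
from bs4.element import Comment

soup = BeautifulSoup("<div><!-- a comment --></div>", "lxml")
c = soup.div.string  # the div's only child is the comment node
print(type(c), repr(c))        # <class 'bs4.element.Comment'> ' a comment '
print(isinstance(c, Comment))  # True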
Traversing the document tree
Finding the content you care about in the tree is the everyday work; in other words, how to traverse the tree's nodes. The test.html above is used for the tests.
Using Tag
soup.div finds the first div node starting from the root and returns a Tag object.
soup.div.p finds the first div from the root, then finds the first p under that Tag, returning it as a Tag object.
soup.p returns the p whose text is "字典", not the one whose text is "bottom", showing that the traversal is depth-first; the result is again a Tag object. A standalone sketch follows below.
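A minimal sketch of the depth-first behavior (standalone markup, made up for the demo):

from bs4 import BeautifulSoup

# the nested p is reached before the later top-level p
soup = BeautifulSoup("<div><p>inner</p></div><p>outer</p>", "lxml")
print(soup.p.string)  # inner: traversal descends into the div first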
Traversing direct children
Tag.contents  # all direct children of any type, as a list
Tag.children  # an iterator over the direct children; yields the same nodes as Tag.contents
from bs4 import BeautifulSoup
from bs4.element import Tag

with open("d://xdd.html", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "lxml")
    print(soup.p.string)
    print(soup.div.contents)  # list of direct children
    print("- " * 30)
    for i in soup.div.children:  # iterable of direct children
        print(i.name)
    print("- " * 30)
    print(list(map(lambda x: x.name if x.name else x,
                   soup.div.descendants)))  # all descendants
Traversing strings
In the example above, soup.div.string returned None because .string requires soup.div to have exactly one child of type NavigableString, i.e. something like <div>only string</div>.
Tag.string           # the Tag's single string child; None if there is more than one node
Tag.strings          # an iterator over all the string objects, extra whitespace included
Tag.stripped_strings # an iterator with the extra whitespace stripped
from bs4 import BeautifulSoup
from bs4.element import Tag

with open("d://xdd.html", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "lxml")
    print(soup.div.string)  # None: more than one child node
    print("- " * 30)
    print("".join(soup.div.strings).strip())   # iterator, extra whitespace kept
    print("- " * 30)
    print("".join(soup.div.stripped_strings))  # iterator, extra whitespace stripped
Traversing ancestor nodes
BeautifulSoup.parent         # the parent of the root node; always None, the root has no parent
Tag.parent                   # the first Tag's parent node
Tag.parent.parent.get("id")  # the id attribute of the first Tag's grandparent
Tag.parents                  # all the Tag node's ancestors, nearest first
from bs4 import BeautifulSoup
from bs4.element import Tag

with open("d://xdd.html", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "lxml")
    print(type(soup))
    print(soup.parent)
    print(soup.div.parent.name)  # body, the first div's parent
    print(soup.p.parent.parent.get("id"))  # the id attribute: main
    print("- " * 30)
    print(list(map(lambda x: x.name, soup.p.parents)))  # ancestor iterator, nearest first
Traversing sibling nodes
Tag.next_sibling      # the first Tag element's next (following) sibling; note: it may be a text node
Tag.previous_sibling  # the sibling before (above) the first Tag element; note: it may be a text node
Tag.next_siblings     # all of the Tag element's following siblings
from bs4 import BeautifulSoup
from bs4.element import Tag

with open("d://xdd.html", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "lxml")
    print(type(soup), type(soup.p))
    print("{} [{}]".format(1, soup.p.next_sibling.encode()))
    print("{} [{}]".format(2, soup.p.previous_sibling.encode()))
    print(soup.p.previous_sibling.next_sibling)  # equivalent to soup.p
    print(soup.p.next_sibling.previous_sibling)  # equivalent to soup.p
    print(soup.p)
    print(list(soup.p.next_siblings))
Traversing other elements
Tag.next_element   # the next parseable object (a string or a tag); not the same as the next sibling next_sibling
Tag.next_elements  # all following parseable objects, as an iterable
from bs4 import BeautifulSoup

with open("d://xdd.html", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "lxml")
    print(type(soup), type(soup.p))
    print(soup.p.next_element)  # the text "字典"
    print(soup.p.next_element.next_element.encode())
    print(soup.p.next_element.next_element.next_element)
    print(list(soup.p.next_elements))
    print("- " * 30)
    # compare the difference
    print(list(soup.p.next_elements))
    print(list(soup.p.next_siblings))
Searching the document tree
The find family has many variants; consult the documentation:
https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/#id25
find_all(name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs)
# returns a list immediately (it is not lazy)
Regular expression object: matches tag names against the pattern
import re
print(soup.find_all(re.compile(r"^h\d")))  # tag names starting with h followed by a digit
List: OR relation, matching each string in the list
print(soup.find_all(["p", "h1", "h3"]))  # OR relation: find all the listed tags
print(soup.find_all(re.compile(r"^p|h|\d$")))  # the same done with a regular expression
True or None: find_all returns every non-string, non-comment node, i.e. all the Tag nodes
from bs4 import BeautifulSoup

with open("d://xdd.html", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "lxml")
    print(list(map(lambda x: x.name, soup.find_all(True))))
    print(list(map(lambda x: x.name, soup.find_all(None))))
    print(list(map(lambda x: x.name, soup.find_all())))
from bs4 import BeautifulSoup
from bs4.element import Tag

with open("d://xdd.html", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "lxml")
    values = [True, None, False]
    for value in values:
        results = soup.find_all(value)  # renamed to avoid shadowing the built-in all()
        if results:  # find_all(False) matches nothing, so guard the empty case
            print(type(results[0]))
        print(len(results))
        print("- " * 30)

    count = 0
    for i, t in enumerate(soup.descendants):  # walk descendants of every type
        print(i, type(t), t.name)
        if isinstance(t, Tag):  # count only the Tag nodes
            count += 1
    print(count)
    # the counts match, so Tag-type nodes are returned; the source confirms it returns Tag objects
If the filters above still can't extract the nodes you want, you can use a function. The function must accept exactly one argument.
If the function returns True, the current node matches; if it returns False, it does not.
Example: find every node whose class attribute has multiple values (in the test html only the h3 tag qualifies).
from bs4 import BeautifulSoup
from bs4.element import Tag

def many_classes(tag: Tag):
    # print(type(tag))
    # print(type(tag.attrs))
    return len(tag.attrs.get("class", [])) > 1

with open("d://xdd.html", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "lxml")
    print(soup.find_all(many_classes))
Attribute names can also be passed to find_all as keyword arguments:

import re
from bs4 import BeautifulSoup

with open("d://xdd.html", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "lxml")
    print(soup.find_all(id="first"))  # list of all nodes whose id is "first"
    print(1, "- " * 30)
    print(soup.find_all(id=re.compile(r"\w+")))  # effectively: every node that has an id
    print(2, "- " * 30)
    print(soup.find_all(id=True))  # every node with an id attribute
    print(3, "- " * 30)
    print(list(map(lambda x: x["id"], soup.find_all(id=True))))
    print(4, "- " * 30)
    print(soup.find_all(id=["first", re.compile(r"^sec")]))  # a list of id names/patterns
    print(5, "- " * 30)
    print(soup.find_all(id=True, src=True))  # AND condition: nodes with both id and src attributes
Special handling for the CSS class attribute
class is a Python keyword, so use class_ instead. class is a multi-valued attribute: you can match any one of its values, or match the full string exactly.
print(soup.find_all(class_="content"))
print(soup.find_all(class_="title"))      # any one CSS class works
print(soup.find_all(class_="highlight"))  # any one CSS class works
print(soup.find_all(class_="highlight title"))  # wrong order: not found
print(soup.find_all(class_="title highlight"))  # same order: found; it is an exact string match
The attrs parameter
attrs takes a dict whose keys are attribute names; each value may be a string, a regular expression object, True, or a list. Multiple attributes can be given.
print(soup.find_all(attrs={"class": "title"}))
print(soup.find_all(attrs={"class": "highlight"}))
print(soup.find_all(attrs={"class": "title highlight"}))
print(soup.find_all(attrs={"id": True}))
print(soup.find_all(attrs={"id": re.compile(r"\d$")}))
print(list(map(lambda x: x.name, soup.find_all(attrs={"id": True, "src": True}))))
The text parameter
Searches the document's string content via the text parameter; it accepts a string, a regular expression object, True, or a list.
from bs4 import BeautifulSoup
import re

with open("d://xdd.html", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "lxml")
    print(list(map(lambda x: (type(x), x), soup.find_all(text=re.compile(r"\w+")))))  # returns text nodes
    print("- " * 30)
    print(list(map(lambda x: (type(x), x), soup.find_all(text=re.compile("[a-z]+")))))
    print("- " * 30)
    # filters the Tag objects, then checks whether each tag's string satisfies the text parameter; returns Tag objects
    print(soup.find_all(re.compile(r"^(h|p)"), text=re.compile("[a-z]+")))
The limit parameter: caps how many results are returned.
print(soup.find_all(id=True, limit=3))  # the returned list contains 3 results
The recursive parameter
By default the search recurses through all descendants; set it to False if that is not wanted, as in the sketch below.
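A minimal sketch (inline markup made up for the demo) of the difference:

from bs4 import BeautifulSoup

soup = BeautifulSoup("<div><p>direct<span>nested</span></p></div>", "lxml")
div = soup.div
print(div.find_all("span"))                   # [<span>nested</span>]: all descendants searched
print(div.find_all("span", recursive=False))  # []: span is not a direct child of the div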
import re
from bs4 import BeautifulSoup

with open("d://xdd.html", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "lxml")
    print(soup("img"))  # list of all img tag objects; equivalent to soup.find_all("img")
    print(soup.img)     # the first img, depth-first
    print(soup.a.find_all(text=True))  # returns the text nodes
    print(soup.a(text=True))           # returns the text nodes; equivalent to the line above
    print(soup("a", text=True))        # returns a tag objects
    print(soup.find_all("img", attrs={"id": "bg1"}))
    print(soup("img", attrs={"id": "bg1"}))  # find_all can be omitted
    print(soup("img", attrs={"id": re.compile("1")}))
The find method
find(name, attrs, recursive, text, **kwargs)
The parameters are almost the same as find_all's.
On a match, find_all returns a list while find returns a single value, the element object itself.
On no match, find_all returns an empty list while find returns None, as the sketch below shows.
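A minimal sketch (inline markup made up for the demo) of the not-found behavior:

from bs4 import BeautifulSoup

soup = BeautifulSoup("<p>hi</p>", "lxml")
print(soup.find_all("table"))  # []: an empty list when nothing matches
print(soup.find("table"))      # None when nothing matches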
from bs4 import BeautifulSoup

with open("d://xdd.html", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "lxml")
    print(soup.find("img", attrs={"id": "bg1"}).attrs.get("src", "xdd"))
    print(soup.find("img", attrs={"id": "bg1"}).get("src"))  # attrs access shortened
    print(soup.find("img", attrs={"id": "bg1"})["src"])
CSS selectors
As with jQuery, CSS selectors can be used to find nodes.
Use the soup.select() method; it supports most CSS selectors and returns a list.
In CSS, tag names are used directly, class names are prefixed with a dot (.), and id names with a hash (#).
BeautifulSoup.select("css selector")
from bs4 import BeautifulSoup

with open("d://xdd.html", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "lxml")
    # element selector
    print(1, soup.select("p"))  # all p tags
    # class selector
    print(2, soup.select(".title"))
    # pseudo-class:
    # the second of the same-type (same tag name) p tags among the direct children;
    # only nth-of-type is implemented, and it requires a number
    print(3, soup.select("div.content > p:nth-of-type(2)"))
    # id selector
    print(4, soup.select("p#second"))
    print(5, soup.select("#bg1"))
    # descendant selector
    print(6, soup.select("div p"))      # p at any depth under a div
    print(7, soup.select("div div p"))  # p at any depth under a div that is under a div
    # child selector, direct children
    print(8, soup.select("div > p"))    # p as a direct child of a div: 2 matches
    # adjacent sibling selector
    print(9, soup.select("div p:nth-of-type(1) + [src]"))    # returns []
    print(9, soup.select("div p:nth-of-type(1) + p"))        # returns the p tag
    print(9, soup.select("div > p:nth-of-type(2) + input"))  # returns the input Tag
    print(9, soup.select("div > p:nth-of-type(2) + [type]")) # same as above
    # general sibling selector
    print(10, soup.select("div p:nth-of-type(1) ~ [src]"))   # returns the 2 img tags
    # attribute selectors
    print(11, soup.select("[src]"))      # has a src attribute
    print(12, soup.select("[src='/']"))  # src equals /
    print(13, soup.select("[src='http://www.xdd.com/']"))  # exact match
    print(14, soup.select("[src^='http://www']"))  # starts with http://www
    print(15, soup.select("[src$='com/']"))        # ends with com/
    print(16, soup.select("img[src*='xdd']"))      # contains xdd
    print(17, soup.select("img[src*='.com']"))     # contains .com
    print(18, soup.select("[class='title highlight']"))  # exact match of class equal to 'title highlight'
    print(19, soup.select("[class~=title]"))  # one value of the multi-valued attribute is title
Getting text content
The point of searching for a node is usually to extract its text content; generally the HTML markup is not needed, only the words.
from bs4 import BeautifulSoup

with open("d://xdd.html", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "lxml")
    # element selector
    ele = soup.select("div")  # all div tags
    print(type(ele))
    print(ele[0].string)  # only if the content is purely text; otherwise None
    print(list(ele[0].strings))           # iterate, whitespace kept
    print(list(ele[0].stripped_strings))  # iterate, whitespace stripped
    print("- " * 30)
    print(ele[0])
    print("- " * 30)
    print(list(ele[0].text))  # essentially get_text(): the strings joined, whitespace kept
    print(list(ele[0].get_text()))  # join of the iterated strings; strip defaults to False
    print(list(ele[0].get_text(strip=True)))  # joined with whitespace stripped
The source of .string in bs4.element.Tag:

class Tag(PageElement):
    @property
    def string(self):
        if len(self.contents) != 1:
            return None
        child = self.contents[0]
        if isinstance(child, NavigableString):
            return child
        return child.string

    @string.setter
    def string(self, string):
        self.clear()
        self.append(string.__class__(string))

    def _all_strings(self, strip=False, types=(NavigableString, CData)):
        for descendant in self.descendants:
            if ((types is None and not isinstance(descendant, NavigableString))
                    or (types is not None and type(descendant) not in types)):
                continue
            if strip:
                descendant = descendant.strip()
                if len(descendant) == 0:
                    continue
            yield descendant

    strings = property(_all_strings)

    @property
    def stripped_strings(self):
        for string in self._all_strings(True):
            yield string

    def get_text(self, separator="", strip=False,
                 types=(NavigableString, CData)):
        return separator.join([s for s in self._all_strings(
            strip, types=types)])

    getText = get_text
    text = property(get_text)
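Per the setter above (clear() followed by append()), .string can also be assigned, replacing all of a tag's children with a single string. A minimal sketch with made-up markup:

from bs4 import BeautifulSoup

soup = BeautifulSoup("<p><b>old</b>text</p>", "lxml")
soup.p.string = "new"  # clear() drops both children, append() adds one string node
print(soup.p)          # <p>new</p>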
JSON parsing
Given a JSON string, extracting part of its content requires traversing it, making judgments along the way.
There is another approach, similar to XPath, called JSONPath; a syntax sketch follows below.
Install
pip install jsonpath
Official site
https://goessner.net/articles/JsonPath/
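A minimal syntax sketch on made-up data (the store/book structure here is illustrative, not from Douban):

from jsonpath import jsonpath  # pip install jsonpath

data = {"store": {"book": [{"title": "A", "price": 8},
                           {"title": "B", "price": 12}]}}

print(jsonpath(data, "$.store.book[*].title"))           # ['A', 'B']: explicit path
print(jsonpath(data, "$..price"))                        # [8, 12]: any depth under the root
print(jsonpath(data, "$..book[?(@.price > 10)].title"))  # ['B']: ?() filter expression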
Comprehensive example, using the JSON feed of Douban's popular movies:
https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&page_limit=10&page_start=0
"subjects"
:[
"rate"
:
"
8.8
"
,
"cover_x"
:
1500
,
"title"
:
"
寄生虫
"
,
"url"
:
"
https://movie.douban.com/subject/27010768/
"
,
"playable"
:
false
,
"cover"
:
"
https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2561439800.jpg
"
,
"id"
:
"
27010768
"
,
"cover_y"
:
2138
,
"is_new"
:
false
"rate"
:
"
7.7
"
,
"cover_x"
:
1500
,
"title"
:
"
恶人传
"
,
"url"
:
"
https://movie.douban.com/subject/30211551/
"
,
"playable"
:
false
,
"cover"
:
"
https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2555084871.jpg
"
,
"id"
:
"
30211551
"
,
"cover_y"
:
2145
,
"is_new"
:
false
"rate"
:
"
6.6
"
,
"cover_x"
:
1500
,
"title"
:
"
异地母子情
"
,
"url"
:
"
https://movie.douban.com/subject/26261189/
"
,
"playable"
:
false
,
"cover"
:
"
https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2562107493.jpg
"
,
"id"
:
"
26261189
"
,
"cover_y"
:
2222
,
"is_new"
:
true
"rate"
:
"
6.7
"
,
"cover_x"
:
2025
,
"title"
:
"
我的生命之光
"
,
"url"
:
"
https://movie.douban.com/subject/26962841/
"
,
"playable"
:
false
,
"cover"
:
"
https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2563625370.jpg
"
,
"id"
:
"
26962841
"
,
"cover_y"
:
3000
,
"is_new"
:
true
"rate"
:
"
7.3
"
,
"cover_x"
:
2025
,
"title"
:
"
皮肤
"
,
"url"
:
"
https://movie.douban.com/subject/27041467/
"
,
"playable"
:
false
,
"cover"
:
"
https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2559479239.jpg
"
,
"id"
:
"
27041467
"
,
"cover_y"
:
3000
,
"is_new"
:
true
"rate"
:
"
8.9
"
,
"cover_x"
:
2000
,
"title"
:
"
绿皮书
"
,
"url"
:
"
https://movie.douban.com/subject/27060077/
"
,
"playable"
:
true
,
"cover"
:
"
https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2549177902.jpg
"
,
"id"
:
"
27060077
"
,
"cover_y"
:
3167
,
"is_new"
:
false
"rate"
:
"
8.0
"
,
"cover_x"
:
3600
,
"title"
:
"
疾速备战
"
,
"url"
:
"
https://movie.douban.com/subject/26909790/
"
,
"playable"
:
false
,
"cover"
:
"
https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2551393832.jpg
"
,
"id"
:
"
26909790
"
,
"cover_y"
:
5550
,
"is_new"
:
false
"rate"
:
"
7.9
"
,
"cover_x"
:
1786
,
"title"
:
"
流浪地球
"
,
"url"
:
"
https://movie.douban.com/subject/26266893/
"
,
"playable"
:
true
,
"cover"
:
"
https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2545472803.jpg
"
,
"id"
:
"
26266893
"
,
"cover_y"
:
2500
,
"is_new"
:
false
"rate"
:
"
8.2
"
,
"cover_x"
:
684
,
"title"
:
"
沦落人
"
,
"url"
:
"
https://movie.douban.com/subject/30140231/
"
,
"playable"
:
false
,
"cover"
:
"
https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2555952192.jpg
"
,
"id"
:
"
30140231
"
,
"cover_y"
:
960
,
"is_new"
:
false
"rate"
:
"
6.4
"
,
"cover_x"
:
960
,
"title"
:
"
疯狂的外星人
"
,
"url"
:
"
https://movie.douban.com/subject/25986662/
"
,
"playable"
:
true
,
"cover"
:
"
https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2541901817.jpg
"
,
"id"
:
"
25986662
"
,
"cover_y"
:
1359
,
"is_new"
:
false
import json
import requests
from jsonpath import jsonpath

ua = "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN) AppleWebKit/537.36 (KHTML, like Gecko) Version/5.0.1 Safari/537.36"
url = "https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&page_limit=10&page_start=0"

with requests.get(url, headers={"User-agent": ua}) as response:
    if response.status_code == 200:
        text = response.text
        print(text[:100])
        js = json.loads(text)
        print(str(js)[:100])  # the JSON converted into Python data structures

        # get all movie titles
        rs1 = jsonpath(js, "$..title")  # title at any depth under the root
        print(rs1)

        # find all subjects
        rs2 = jsonpath(js, "$..subjects")
        print(len(rs2), str(rs2[0])[:100])  # too long, so only the first 100 characters
        print("- " * 30)

        # titles of all movies rated above 8:
        # subjects at any depth under the root whose child rate is greater than the string "8"
        rs3 = jsonpath(js, '$..subjects[?(@.rate > "8")]')  # ?() is a filter
        print(rs3)
        print("- " * 30)

        # the title children of those filtered subjects nodes
        rs4 = jsonpath(js, '$..subjects[?(@.rate > "8")].title')
        print(rs4)
        print("- " * 30)

        rs5 = jsonpath(js, "$..subjects[?(@.rate > '6')].title")
        print(rs5[:2])