# odoochina / china_addons


			
							# -*- coding: utf-8 -*-

from io import StringIO

import pandas as pd
import requests
from lxml import etree

pd.set_option('display.max_columns', 10)
pd.set_option("max_colwidth", 200)
pd.set_option("display.width", 200)
pd.set_option('display.max_rows', 200)


# Documentation page to download and scrape.
url = "https://developer.work.weixin.qq.com/document/path/90313"

# XPath selectors evaluated against the downloaded page.
anchor = "//h2[@data-sign='12309549435022fd54b0549f5968b4c2']"
codes = "//h5"
# NOTE: the name `methods` is rebound to a list further down the script; at
# this point it is only the selector string consumed by `methods_elements`.
methods = "//p[@data-type='p']"

# A timeout keeps the script from blocking forever on an unresponsive server,
# and raise_for_status() stops an HTTP error page from being parsed as if it
# were the real document.
response = requests.get(url=url, timeout=30)
response.raise_for_status()
page_text = response.text
tree = etree.HTML(page_text)

# `anchor_elemen` and `methods_elements` are not read anywhere in the visible
# part of the script; they are kept so the module-level names stay available.
anchor_elemen = tree.xpath(anchor)
codes_elements = tree.xpath(codes)
methods_elements = tree.xpath(methods)

# Build a list of {"code": ..., "method": ...} dicts: one entry per error code
# named in an <h5> heading, paired with the serialized HTML of the element
# that immediately follows that heading.
methods = []
for code_element in codes_elements:
    heading_texts = code_element.xpath("text()")
    if not heading_texts:
        # Heading without direct text (e.g. only child tags): nothing to parse.
        continue

    # The code(s) are whatever follows the first full-width colon.
    _, separator, error_code = heading_texts[0].partition("：")
    if not separator:
        # Not a "label：code" heading; skip it instead of raising IndexError.
        continue

    method_element = code_element.getnext()
    if method_element is None:
        # Heading is the last child of its parent, so there is no following
        # element to serialize.
        continue
    method = etree.tostring(
        method_element, encoding="utf-8", pretty_print=True
    ).decode()

    # One heading may list several whitespace-separated error codes.
    # split() without arguments returns every code (not just the first two)
    # and never yields empty strings for leading/trailing/repeated spaces.
    for single_code in error_code.split():
        methods.append({"code": single_code, "method": method})
        
# Take the first table inside a "cherry-table-container" div and serialize it
# back to an HTML string so pandas can parse it.
table_nodes = tree.xpath("//div[@class='cherry-table-container']/table")
table = etree.tostring(table_nodes[0], encoding="utf-8").decode()

# Rename the Chinese column headers to the English field names used below.
header_translations = (
    ("<th>错误码</th>", "<th>code</th>"),
    ("<th>错误说明</th>", "<th>name</th>"),
    ("<th>排查方法</th>", "<th>method</th>"),
)
for original_header, english_header in header_translations:
    table = table.replace(original_header, english_header)

# Passing literal HTML to read_html is deprecated (pandas >= 2.1); a StringIO
# wrapper is accepted by both old and new pandas versions.
df = pd.read_html(StringIO(table), encoding="utf-8", header=0)[0]

# Drop the unnamed fourth column when the table has one.
if "Unnamed: 3" in df.columns:
    del df["Unnamed: 3"]

# One dict per table row, keyed by column name. orient="records" avoids the
# transpose (and the dtype coercion that comes with it) of df.T.to_dict().
error_results = df.to_dict(orient="records")

# Stamp every row dict with its position in the table. The dicts are updated
# in place, so `errors` ends up holding the very same objects as
# `error_results`, just in a separate list.
for position, record in enumerate(error_results):
    record.update(sequence=position)
errors = list(error_results)

# Rebuild the frame so it includes the new "sequence" column, then show it.
df = pd.DataFrame(data=errors)
print(df)