中国本土应用
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

65 lines
2.1KB

  1. # -*- coding: utf-8 -*-
  2. import requests
  3. import pandas as pd
  4. pd.set_option('display.max_columns', 10)
  5. pd.set_option("max_colwidth", 200)
  6. pd.set_option("display.width", 200)
  7. pd.set_option('display.max_rows', 200)
  8. from lxml import etree
  9. url = "https://developer.work.weixin.qq.com/document/path/90313"
  10. anchor = "//h2[@data-sign='12309549435022fd54b0549f5968b4c2']"
  11. codes = "//h5"
  12. methods ="//p[@data-type='p']"
  13. page_text = requests.get(url=url).text
  14. tree = etree.HTML(page_text)
  15. anchor_elemen = tree.xpath(anchor)
  16. codes_elements = tree.xpath(codes)
  17. methods_elements = tree.xpath(methods)
  18. methods = []
  19. for code_element in codes_elements:
  20. code_element_str = code_element.xpath("text()")[0]
  21. error_code = code_element_str.split(":", 1)[1:][0]
  22. method_element = code_element.getnext()
  23. method = etree.tostring(method_element, encoding="utf-8", pretty_print=True ).decode()
  24. if " " in error_code:
  25. # 一个元素存在多个错误码
  26. multiple_codes = error_code.split(" ", 1)
  27. for multiple_code in multiple_codes:
  28. multiple_dic = {}
  29. multiple_dic["code"] = multiple_code
  30. multiple_dic["method"] = method
  31. methods.append(multiple_dic)
  32. else:
  33. dic = {}
  34. dic["code"] = error_code
  35. dic["method"] = method
  36. methods.append(dic)
  37. table = tree.xpath("//div[@class='cherry-table-container']/table") # 取出表格
  38. table = etree.tostring(
  39. table[0], encoding="utf-8"
  40. ).decode() # 将第一个表格转成string格式
  41. table = table.replace("<th>错误码</th>", "<th>code</th>")
  42. table = table.replace("<th>错误说明</th>", "<th>name</th>")
  43. table = table.replace("<th>排查方法</th>", "<th>method</th>")
  44. df = pd.read_html(table, encoding="utf-8", header=0)[0] # pandas读取table
  45. if 'Unnamed: 3' in df.columns:
  46. del df['Unnamed: 3']
  47. error_results = list(df.T.to_dict().values()) # 转换成列表嵌套字典的格式
  48. errors = []
  49. for index, error in enumerate(error_results):
  50. error["sequence"] = index
  51. errors.append(error)
  52. df = pd.DataFrame(errors)
  53. print(df)
上海开阖软件有限公司 沪ICP备12045867号-1