XPath crawler: scraping national administrative division and urban-rural division data
[Abstract] Data source: the 2020 national codes for administrative divisions and urban-rural divisions. Code example: scraping Hefei, Anhui Province with requests, lxml (XPath), and pandas.
Data source: 2020 national administrative division and urban-rural division codes
Code example: Hefei, Anhui Province
import requests
from lxml import etree
import pandas as pd
def get_html(url):
    header = {'user-agent': 'your own browser User-Agent string'}
    try:
        response = requests.get(url, headers=header)
        # Check that the page was returned successfully
        if response.status_code == 200:
            # The site serves GBK-encoded pages, so decode explicitly
            return response.content.decode('gbk')
        else:
            print("{0}Unexpected HTTP status code!{0}".format("-" * 10))
    except Exception as e:
        print("{0}Request failed: {1}{0}".format("-" * 10, e))
def parse_url(url, xpath_path):
    """Intermediate levels: return (area name, next-level URL) pairs."""
    html = get_html(url)
    # Base part of the next-level URLs: the current URL minus its file name
    next_base_url = "/".join(url.split("/")[:-1])
    # Parse the HTML
    HTML = etree.HTML(html)
    # Area names and the relative links to the next level
    all_area = HTML.xpath(f'{xpath_path}/text()')
    next_link = HTML.xpath(f'{xpath_path}/@href')
    return [(i[0], next_base_url + "/" + i[1]) for i in list(zip(all_area, next_link))]
def parse_url2(url, xpath_path):
    """Last level: no further links, only the text of the cells."""
    html = get_html(url)
    # Parse the HTML
    HTML = etree.HTML(html)
    villagetr = HTML.xpath(f'{xpath_path}/text()')
    return villagetr
result = []
xpath_path = '//tr[@class="countytr"]/td[2]/a'
# City-level page for Hefei (city code 3401), Anhui Province (province code 34)
url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/34/3401.html"
# Note: each parse_url(...) iterable is built once when its loop starts,
# so rebinding url and xpath_path inside the loop body is safe.
# City page ==> districts/counties: names & links
for i in parse_url(url, xpath_path):
    area1, url = i
    xpath_path = '//tr[@class="towntr"]/td[2]/a'
    # District page ==> towns/subdistricts: names & links
    for j in parse_url(url, xpath_path):
        area2, url = j
        xpath_path = '//tr[@class="villagetr"]/td[3]'
        # Town/subdistrict page ==> village/neighborhood committees: names only
        for k in parse_url2(url, xpath_path):
            result.append([area1, area2, k])
df = pd.DataFrame(result, columns=["区", "镇/街道", "居委会"])  # District, Town/Subdistrict, Committee
df.to_excel("合肥市行政区域划分.xlsx", index=False)  # "Hefei administrative divisions.xlsx"
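
One practical caveat: the nested loops issue one request per district page and one per town page, so a full run hits stats.gov.cn dozens of times in quick succession. Below is a minimal sketch, not part of the original article, of a throttled drop-in replacement for get_html; the delay, retry count, and timeout values are assumptions you can tune:

import time
import requests

def get_html_polite(url, retries=3, delay=1.0):
    """Fetch a page politely: pause before each request and retry on failure."""
    header = {'user-agent': 'your own browser User-Agent string'}
    for attempt in range(1, retries + 1):
        try:
            time.sleep(delay)  # assumed pause between requests to go easy on the server
            response = requests.get(url, headers=header, timeout=10)
            if response.status_code == 200:
                return response.content.decode('gbk')  # site pages are GBK-encoded
            print("Attempt {}: status code {} for {}".format(attempt, response.status_code, url))
        except requests.RequestException as e:
            print("Attempt {}: request failed for {}: {}".format(attempt, url, e))
    return None

If you use it, have parse_url and parse_url2 call get_html_polite instead of get_html; nothing else in the script needs to change.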