2.10.2 Detailed Implementation Process
Step 1 is to write the Spider. A spider is a user-defined class that parses web pages and scrapes the content returned from the specified URLs; each spider can handle one domain or a group of domains. In other words, it defines the crawling and parsing rules for a particular site. The concrete operations for this step are described below (a sketch of how the following fragments assemble into a complete spider class is given after the code for this step).
(1) Define the initial request for the first URL; when the request returns, a callback function is invoked.
start_urls = ['http://www.chinaticket.com/']
(2) List the URLs of the ticketing category pages to be crawled.
urls = {
    'yanchanghui': "http://www.chinaticket.com/wenyi/yanchanghui/",
    'huaju': "http://www.chinaticket.com/wenyi/huaju/",
    'yinlehui': "http://www.chinaticket.com/wenyi/yinlehui/",
    'yinleju': "http://www.chinaticket.com/wenyi/yinleju/",
    'xiqu': "http://www.chinaticket.com/wenyi/xiqu/",
    'baleiwu': "http://www.chinaticket.com/wenyi/baleiwu/",
    'qinzijiating': "http://www.chinaticket.com/wenyi/qinzijiating/",
    'zaji': "http://www.chinaticket.com/wenyi/zaji/",
    'xiangshengxiaopin': "http://www.chinaticket.com/wenyi/xiangshengxiaopin/",
    'zongyijiemu': "http://www.chinaticket.com/wenyi/zongyijiemu/",
    'zuqiu': "http://www.chinaticket.com/tiyu/zuqiu/",
    'gaoerfuqiu': "http://www.chinaticket.com/tiyu/gaoerfuqiu/",
    'Cbalanqiu': "http://www.chinaticket.com/tiyu/Cbalanqiu/",
    'saiche': "http://www.chinaticket.com/tiyu/saiche/",
    'quanji': "http://www.chinaticket.com/tiyu/quanji/",
    'dianyingpiao': "http://www.chinaticket.com/qita/dianyingpiao/",
    'jingdianmenpiao': "http://www.chinaticket.com/qita/jingdianmenpiao/",
    'zhanlan': "http://www.chinaticket.com/qita/zhanlan/",
    'yundongxiuxian': "http://www.chinaticket.com/qita/yundongxiuxian/",
    'lipinquan': "http://www.chinaticket.com/qita/lipinquan/",
    'huiyi': "http://www.chinaticket.com/qita/huiyi/",
}
(3) Write the page request function, which issues a request for each of the pages above and hands the response to the parser.
def start_requests(self):
    try:
        for key, value in self.urls.items():        # loop over the category pages to request
            yield Request(value.encode('utf-8'),
                          meta={"type": key.encode('utf-8')},
                          callback=self.parse)
    except Exception as err:                        # report any error
        print(err)
(4) Write the function that fetches the next page of results, used to traverse all pages (left as a stub here; a possible sketch follows the code below).
def get_next_url(self):
    try:
        pass                                        # traverse all result pages (stub)
    except Exception as err:                        # report any error
        print(err)
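The book leaves get_next_url() as a stub; in step (5) the follow-up pages are actually built at the end of parse(). Purely as an illustration, a minimal sketch of such a helper, assuming the "?o=2&page=N" pattern used there and hypothetical category and current_page parameters, might look like this:

def get_next_url(self, category, current_page):
    # Hypothetical sketch: build the URL of the next result page for a category,
    # mirroring the "?o=2&page=N" pattern that parse() uses in step (5)
    try:
        return "http://www.chinaticket.com/wenyi/%s/?o=2&page=%d" % (category, current_page + 1)
    except Exception as err:
        print(err)
        return None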
(5) Write the parse function for the ticketing pages, which extracts the ticket details (price, time, venue, and so on).
def parse(self, response):
    try:
        item = TicketCrawlerItem()
        meta = response.meta
        result = response.text.encode("utf-8")               # page source, encoded as UTF-8
        if result == '' or result == 'None':                  # empty page result
            print("Can't get the sourceCode ")                # report that nothing was returned
            sys.exit()
        tree = etree.HTML(result)                             # parse the page source
        data = []
        # number of performances listed
        page = tree.xpath("//*[@class='s_num']/text()")[1].replace("\n", "").replace(" ", "").encode("utf-8")
        # number of result pages
        calculateNum = calculatePageNumber()
        pageNUM = calculateNum.calculate_page_number(page)
        count = (pageNUM // 10) + 1
        listDoms = tree.xpath("//*[@class='s_ticket_list']//ul")
        if (listDoms):
            for itemDom in listDoms:                          # iterate over the listed performances
                # store the data
                item['type'] = meta['type'].encode("utf-8")
                try:
                    titleDom = itemDom.xpath("li[@class='ticket_list_tu fl']/a/text()")
                    if (titleDom[0]):                         # check the title
                        item['name'] = titleDom[0].encode("utf-8")
                except Exception as err:                      # report any error
                    print(err)
                try:
                    urlDom = itemDom.xpath("li[@class='ticket_list_tu fl']/a/@href")
                    if (urlDom[0]):                           # check the detail link
                        item['url'] = urlDom[0].encode("utf-8")
                except Exception as err:                      # report any error
                    print(err)
                try:
                    timeDom = itemDom.xpath("li[@class='ticket_list_tu fl']/span[1]/text()")
                    if (timeDom[0]):                          # check the time
                        item['time'] = timeDom[0].encode("utf-8").replace('时间:', '')
                except Exception as err:                      # report any error
                    print(err)
                try:
                    addressDom = itemDom.xpath("li[@class='ticket_list_tu fl']/span[2]/text()")
                    if (addressDom[0]):                       # check the venue
                        item['address'] = addressDom[0].encode("utf-8").replace('地点:', '')
                except Exception as err:                      # report any error
                    print(err)
                try:
                    priceDom = itemDom.xpath("li[@class='ticket_list_tu fl']/span[3]/text()")
                    if (priceDom[0]):                         # check the price
                        item['price'] = priceDom[0].encode("utf-8").replace('票价:', '')
                except Exception as err:                      # report any error
                    print(err)
                yield item
        for i in range(2, count + 1):                         # request every remaining result page
            next_page = ("http://www.chinaticket.com/wenyi/" + str(meta['type'])
                         + "/?o=2&page=" + str(i))
            if next_page is not None:                         # check whether there is another page to crawl
                yield scrapy.Request(next_page, meta={"type": meta['type']},
                                     callback=self.parse)
    except Exception as err:                                  # report any error
        print(err)
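The fragments above are the heart of the spider, but the excerpt never shows the surrounding class declaration, the imports, or the calculatePageNumber helper that parse() relies on. A minimal sketch of how the pieces might fit together follows; the class name TicketSpider, the spider name 'tickets', the import paths, and the helper's body are assumptions, not the book's code.

# -*- coding: utf-8 -*-
# Sketch only: everything below that is not shown in the excerpt is an assumption.
import re
import sys
import scrapy
from scrapy import Request
from lxml import etree
from ticketCrawler.items import TicketCrawlerItem    # assumed project layout

class calculatePageNumber(object):
    # Hypothetical implementation of the helper used in parse(): pull the total
    # number of listings out of the "s_num" text so the page count can be derived.
    def calculate_page_number(self, page_text):
        match = re.search(r'\d+', str(page_text))
        return int(match.group()) if match else 0

class TicketSpider(scrapy.Spider):                    # hypothetical class name
    name = 'tickets'                                  # hypothetical spider name used by "scrapy crawl"
    start_urls = ['http://www.chinaticket.com/']
    urls = {}                                         # the category dictionary from step (2) goes here

    # start_requests(), get_next_url() and parse() from steps (3)-(5) are defined here.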
Step 2 is to write the settings, i.e. the crawler configuration: the project name, the spider module declaration, and the MySQL database connection parameters.
BOT_NAME = 'ticketCrawler'        # project name
# Spider module declaration; the engine uses this to locate the spiders
SPIDER_MODULES = ['ticketCrawler.spiders']
NEWSPIDER_MODULE = 'ticketCrawler.spiders'
# Whether to obey the robots.txt protocol; the default is True, but it can be set to False
ROBOTSTXT_OBEY = False
SPIDER_MIDDLEWARES = {
    'ticketCrawler.middlewares.TicketcrawlerSpiderMiddleware': 543,
}
# Write the returned items to persistence modules such as the database or files
ITEM_PIPELINES = {
    'ticketCrawler.pipelines.TicketcrawlerPipeline': 300,
}
# MySQL database configuration
MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'ticketCrawler'    # database name
MYSQL_USER = 'root'               # database user
MYSQL_PASSWORD = 'xxxxxx'         # database password
MYSQL_CHARSET = 'utf8'            # character set (UTF-8; pymysql expects the name 'utf8')
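For reference, the pipeline in step 4 reads these values with a plain import of the settings module; Scrapy's more common idiom is to receive them through the crawler object. A minimal sketch of that alternative (not the book's code), assuming the same setting names, is:

class TicketcrawlerPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this hook with the running crawler; its settings object
        # exposes the MYSQL_* values defined above
        return cls(host=crawler.settings.get('MYSQL_HOST'),
                   db=crawler.settings.get('MYSQL_DBNAME'),
                   user=crawler.settings.get('MYSQL_USER'),
                   passwd=crawler.settings.get('MYSQL_PASSWORD'))

    def __init__(self, host, db, user, passwd):
        # store the connection parameters; the pipeline in step 4 opens the
        # pymysql connection with them instead
        self.host, self.db, self.user, self.passwd = host, db, user, passwd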
Step 3 is to write the items. The item class provides a container for the scraped data and defines how that data is stored.
import scrapy

class TicketCrawlerItem(scrapy.Item):
    # define the item's fields here
    # performance name
    name = scrapy.Field()
    # time
    time = scrapy.Field()
    # venue
    address = scrapy.Field()
    # price
    price = scrapy.Field()
    # performance type
    type = scrapy.Field()
    # link
    url = scrapy.Field()
    # introduction
    introduction = scrapy.Field()
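A scrapy.Item behaves like a dictionary, which is exactly how parse() fills it in step (5). A short illustrative example (the values are made up):

item = TicketCrawlerItem()
item['name'] = 'example show'    # hypothetical values, only to show the dict-like access
item['price'] = '180'
print(item['name'])              # fields are read back the same way
print(dict(item))                # an item can also be converted to a plain dict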
Step 4 is to write the pipelines, which handle the data processing, typically persisting the structured data.
# import the required libraries
import pymysql
from scrapy import log
import settings
from items import TicketCrawlerItem
from sqlalchemy import create_engine, Table, Column
from sqlalchemy import Integer, String, MetaData, VARCHAR, DATETIME, TEXT

# pipeline for the ticket summary information
class TicketcrawlerPipeline(object):
    def __init__(self):
        try:
            # database connection
            self.connect = pymysql.connect(
                host=settings.MYSQL_HOST,
                db=settings.MYSQL_DBNAME,
                user=settings.MYSQL_USER,
                passwd=settings.MYSQL_PASSWORD,
                charset=settings.MYSQL_CHARSET,
            )
            # cursor
            self.cursor = self.connect.cursor()
        except Exception as err:
            print(err)

    # process the data
    def process_item(self, item, spider):
        if item.__class__ == TicketCrawlerItem:
            try:
                # create the data table; skipped if the table already exists
                self.createTable()
                # INSERT statement
                sqlInsert = "INSERT INTO ticketCrawler.tickets(name,price,time,address,type,url) values(%s,%s,%s,%s,%s,%s)"
                self.insertIntoTable(item['url'], sqlInsert, item)
            except Exception as err:
                print(err)
        return item

    # create the table
    def createTable(self):
        try:
            # create the connection
            engine = create_engine("mysql+mysqldb://root:xxxxxx@127.0.0.1:3306/ticketCrawler?charset=utf8",
                                   max_overflow=10000)
            # metadata object
            metadata = MetaData()
            # define the table holding the ticket information
            tickets = Table('tickets', metadata,
                            Column('id', Integer, primary_key=True),
                            Column('name', VARCHAR(256)),
                            Column('price', VARCHAR(256)),
                            Column('time', VARCHAR(256)),
                            Column('address', VARCHAR(256)),
                            Column('type', VARCHAR(256)),
                            Column('url', VARCHAR(256)),
                            Column('introduction', TEXT),
                            Column('last_update_time', DATETIME))
            metadata.create_all(engine)
        except Exception as err:
            print(err)

    # insert into the table
    def insertIntoTable(self, url, sql, item):
        try:
            engine = create_engine("mysql+mysqldb://root:xxxxx@127.0.0.1:3306/ticketCrawler?charset=utf8",
                                   max_overflow=10000)
            # duplicate check before inserting
            selectSql = "SELECT COUNT(*) FROM tickets WHERE url = '%s'" % url
            result = engine.execute(selectSql)
            count_exist = result.fetchall()
            if count_exist[0][0] >= 1:
                print("The record already exists in the table")
            else:
                engine.execute(sql, (item['name'], item['price'], item['time'],
                                     item['address'], item['type'], item['url']))
        except Exception as err:
            print(err)                    # report the error

    # update the table
    def updateTable(self):
        try:
            pass
        except Exception as err:
            print(err)                    # report the error
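The pipeline above opens a pymysql connection in __init__ but never releases it. Scrapy pipelines also support a close_spider hook; a small addition (not in the book) that would close the cursor and the connection when the crawl ends could be placed inside TicketcrawlerPipeline:

    # called once by Scrapy when the spider finishes; release the cursor and
    # the database connection that were opened in __init__
    def close_spider(self, spider):
        try:
            self.cursor.close()
            self.connect.close()
        except Exception as err:
            print(err)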
The final step is simply to run the crawler.
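Assuming the spider's name attribute is 'tickets', as in the earlier sketch (the excerpt never shows the actual value), the crawler is started from the project directory with:

scrapy crawl tickets

Adding -o tickets.json to the command would additionally export the collected items to a JSON file.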