1. The libraries we'll need
1) requests
2) re
(The spider code below also relies on scrapy itself, urllib.parse, datetime, MySQLdb, and Twisted's adbapi.)
2. Create the Scrapy project and spider
scrapy startproject ArticleSpider
scrapy genspider jobbole www.jobbole.com
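After these two commands the scaffolding looks roughly like this (standard Scrapy layout; jobbole.py is the generated spider we edit below):

ArticleSpider/
    scrapy.cfg
    ArticleSpider/
        items.py
        pipelines.py
        settings.py
        spiders/
            jobbole.py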
3. Crawl the URL of every article on the list pages, plus the next-page link
import scrapy
from urllib import parse


class JobboleSpider(scrapy.Spider):
    name = 'jobbole'
    allowed_domains = ['blog.jobbole.com']
    # Starting URL for the crawl
    start_urls = ['http://blog.jobbole.com/all-posts/']

    def parse(self, response):
        # Grab every <a> node in the post list on the current page
        url_list = response.css('#archive .floated-thumb .post-thumb a')
        for url in url_list:
            # src attribute of the <img> tag: the cover image's address
            url_img = url.css('img::attr(src)').extract_first()
            # URL of this entry's detail page
            urls = url.xpath('@href').extract_first()
            # parse.urljoin resolves the href into an absolute address;
            # meta carries font_image_url through to the item
            yield scrapy.Request(parse.urljoin(response.url, urls),
                                 meta={'font_image_url': url_img},
                                 callback=self.get_parse)
        # Address of the next list page, if any
        next_url = response.css('.next.page-numbers::attr(href)').extract_first()
        # Keep looping until there is no next page
        if next_url:
            # Call back into parse for the next page
            yield scrapy.Request(parse.urljoin(response.url, next_url),
                                 callback=self.parse)
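With the spider saved, the crawl is started from the project root:

scrapy crawl jobbole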
4. Analyzing the article detail page
The data we want from each article:
[title, date, tags, body, upvote count, bookmark count, comment count]
# Title
title = response.css('.grid-8 .entry-header h1::text').extract_first()
# Date (strip the trailing "·" separator and whitespace)
data = response.css('.grid-8 .entry-meta p::text').extract_first().strip().replace('·', '').strip()
# Tags
tag_list = response.css('p.entry-meta-hide-on-mobile a::text').extract()
# Drop the comment-count entry that appears among the tags
tag_list = [element for element in tag_list if not element.strip().endswith('评论')]
# Join the tag list into a single comma-separated string
tags = ','.join(tag_list)
# Article body
article = response.css('.grid-8 .entry').extract_first()
# Upvote count
votetotal = response.css('.post-adds h10::text').extract_first()
# Non-greedy regex so multi-digit numbers are captured whole
match_re = re.match(r'.*?(\d+).*', votetotal)
if match_re:
    votetotal = int(match_re.group(1))  # first captured group
else:
    votetotal = 0  # default to 0 when no number is found
# Bookmark count
bookmark = response.css('.post-adds .bookmark-btn::text').extract_first()
match_re = re.match(r'.*?(\d+).*', bookmark)
if match_re:
    bookmark = int(match_re.group(1))
else:
    bookmark = 0
# Comment count
comments = response.css('.post-adds a .hide-on-480::text').extract_first()
match_re = re.match(r'.*?(\d+).*', comments)
if match_re:
    comments = int(match_re.group(1))
else:
    comments = 0
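A quick check of why the lazy `.*?` prefix matters here: with a greedy `.*(\d+).*`, the leading `.*` backtracks only far enough to hand the group a single digit, so multi-digit counts get truncated.

import re

text = ' 12 收藏'
print(re.match(r'.*(\d+).*', text).group(1))   # '2'  -- greedy prefix swallows the '1'
print(re.match(r'.*?(\d+).*', text).group(1))  # '12' -- lazy prefix stops before the number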
The get_parse method extracts the detail-page data:
# At the top of jobbole.py (the module holding get_md5 is an assumption;
# adjust to wherever you defined the helper):
import re
import datetime

from ArticleSpider.items import ArticlespiderItem
from ArticleSpider.utils.common import get_md5

# Inside JobboleSpider:
    def get_parse(self, response):
        # Receive the font_image_url passed through the request's meta
        image_url = response.meta.get('font_image_url', '')
        title = response.css('.grid-8 .entry-header h1::text').extract_first()
        data = response.css('.grid-8 .entry-meta p::text').extract_first().strip().replace('·', '').strip()
        category = response.css('.grid-8 .entry-meta p a::text').extract_first()
        tag = response.css('.grid-8 .entry-meta p a::text')[-1].extract().strip().replace('·', '').strip()
        article = response.css('.grid-8 .entry').extract_first()
        votetotal = response.css('.post-adds h10::text').extract_first()
        match_re = re.match(r'.*?(\d+).*', votetotal)
        if match_re:
            votetotal = int(match_re.group(1))
        else:
            votetotal = 0
        bookmark = response.css('.post-adds .bookmark-btn::text').extract_first()
        match_re = re.match(r'.*?(\d+).*', bookmark)
        if match_re:
            bookmark = int(match_re.group(1))
        else:
            bookmark = 0
        comments = response.css('.post-adds a .hide-on-480::text').extract_first()
        match_re = re.match(r'.*?(\d+).*', comments)
        if match_re:
            comments = int(match_re.group(1))
        else:
            comments = 0
        # Instantiate the item
        item = ArticlespiderItem()
        item['url'] = response.url
        # get_md5 compresses the URL into a fixed-length hash
        item['url_object_id'] = get_md5(response.url)
        item['image_url'] = [image_url]
        # strptime parses the date string into a datetime; take its date part
        try:
            data = datetime.datetime.strptime(data, '%Y/%m/%d').date()
        except Exception:
            # On any parsing error, fall back to the current system date
            data = datetime.datetime.now().date()
        item['data'] = data
        item['title'] = title
        item['category'] = category
        item['tag'] = tag
        item['article'] = article
        item['votetotal'] = votetotal
        item['bookmark'] = bookmark
        item['comments'] = comments
        # Hand the finished item to the pipelines
        yield item
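For completeness, a minimal sketch of the item class these assignments assume, plus a plausible get_md5 helper; the helper's body and location are assumptions, since only the call appears above:

# items.py
import scrapy

class ArticlespiderItem(scrapy.Item):
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    image_url = scrapy.Field()
    image_url_path = scrapy.Field()
    title = scrapy.Field()
    data = scrapy.Field()
    category = scrapy.Field()
    tag = scrapy.Field()
    article = scrapy.Field()
    votetotal = scrapy.Field()
    bookmark = scrapy.Field()
    comments = scrapy.Field()

# utils/common.py (assumed location and body)
import hashlib

def get_md5(url):
    # Reduce a URL of any length to a fixed 32-character hex digest
    if isinstance(url, str):
        url = url.encode('utf-8')
    m = hashlib.md5()
    m.update(url)
    return m.hexdigest()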
5. Download each article's cover image
Add this to settings.py:
import os

# Tell ImagesPipeline which item field holds the image URLs
IMAGES_URLS_FIELD = 'image_url'
# Absolute path of the directory containing settings.py
object_url = os.path.abspath(os.path.dirname(__file__))
# Store downloaded images in an image/ folder next to it
IMAGES_STORE = os.path.join(object_url, 'image')
Scrapy names the downloaded files automatically under the image folder. The results passed to ImagesPipeline's item_completed contain a path value giving the file name each URL was saved under, so we subclass the pipeline and copy that path into the item:
from scrapy.pipelines.images import ImagesPipeline


class ArticleImagePipeline(ImagesPipeline):
    def item_completed(self, results, item, info):
        if 'image_url' in item:
            for ok, value in results:
                if ok:  # skip failed downloads, which carry no 'path'
                    image_file_path = value['path']
                    item['image_url_path'] = image_file_path
        return item
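For the pipeline to run it also has to be registered in settings.py; a minimal sketch, with the ArticleSpider.pipelines module path following the default project layout. Note that ImagesPipeline needs Pillow installed (pip install Pillow).

ITEM_PIPELINES = {
    'ArticleSpider.pipelines.ArticleImagePipeline': 1,
}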
6. As the crawl speeds up, synchronous database writes can no longer keep pace, and the backlog of items waiting to be stored hurts performance, so we use Twisted's adbapi to make the MySQL inserts asynchronous.
import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi


class MysqlTwistePipline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbparms = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DB'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWORD'],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        # adbapi wraps MySQLdb in a Twisted-managed connection pool
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        # Run the insert asynchronously on the connection pool
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.hand_erro)
        return item

    def hand_erro(self, failure):
        # Report any failure raised during the asynchronous insert
        print(failure)

    def do_insert(self, cursor, item):
        url = item['url']
        url_object_id = item['url_object_id']
        image_urls = item['image_url']
        image_url = image_urls[0]
        image_url_path = item['image_url_path']
        title = item['title']
        data = item['data']
        category = item['category']
        tag = item['tag']
        article = item['article']
        votetotal = item['votetotal']
        bookmark = item['bookmark']
        comments = item['comments']
        cursor.execute(
            'insert into jobole(title, data, url, url_object_id, image_url,'
            ' image_url_path, tag, category, article, votetotal, bookmark, comments)'
            ' values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)',
            (title, data, url, url_object_id, image_url, image_url_path,
             tag, category, article, votetotal, bookmark, comments))
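The connection parameters are read from settings.py, where this pipeline must also be registered after the image pipeline. A minimal sketch; the host, database, user, and password values below are placeholders to adapt:

# settings.py (placeholder credentials)
MYSQL_HOST = '127.0.0.1'
MYSQL_DB = 'article_spider'
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'root'

ITEM_PIPELINES = {
    'ArticleSpider.pipelines.ArticleImagePipeline': 1,
    'ArticleSpider.pipelines.MysqlTwistePipline': 2,
}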