1. The libraries we'll need
1) requests
2) re
(The spider code below also relies on scrapy itself, urllib.parse, datetime, MySQLdb, and Twisted's adbapi.)
2. Create the Scrapy project and spider
scrapy startproject ArticleSpider
scrapy genspider jobbole www.jobbole.com
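After these two commands the scaffolding looks roughly like this (standard Scrapy layout; jobbole.py is the generated spider we edit below):

ArticleSpider/
    scrapy.cfg
    ArticleSpider/
        items.py
        pipelines.py
        settings.py
        spiders/
            jobbole.py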
3. Crawl the URL of every article on the list pages, plus the next-page link
import scrapy
from urllib import parse


class JobboleSpider(scrapy.Spider):
    name = 'jobbole'
    allowed_domains = ['blog.jobbole.com']
    # Starting URL for the crawl
    start_urls = ['http://blog.jobbole.com/all-posts/']

    def parse(self, response):
        # Grab every <a> node in the post list on the current page
        url_list = response.css('#archive .floated-thumb .post-thumb a')
        for url in url_list:
            # src attribute of the <img> tag: the cover image's address
            url_img = url.css('img::attr(src)').extract_first()
            # URL of this entry's detail page
            urls = url.xpath('@href').extract_first()
            # parse.urljoin resolves the href into an absolute address;
            # meta carries font_image_url through to the item
            yield scrapy.Request(parse.urljoin(response.url, urls),
                                 meta={'font_image_url': url_img},
                                 callback=self.get_parse)
        # Address of the next list page, if any
        next_url = response.css('.next.page-numbers::attr(href)').extract_first()
        # Keep looping until there is no next page
        if next_url:
            # Call back into parse for the next page
            yield scrapy.Request(parse.urljoin(response.url, next_url),
                                 callback=self.parse)
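With the spider saved, the crawl is started from the project root:

scrapy crawl jobbole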
4. Analyzing the article detail page
The data we want from each article:
[title, date, tags, body, upvote count, bookmark count, comment count]
# Title
title = response.css('.grid-8 .entry-header h1::text').extract_first()
# Date (strip the trailing "·" separator and whitespace)
data = response.css('.grid-8 .entry-meta p::text').extract_first().strip().replace('·', '').strip()
# Tags
tag_list = response.css('p.entry-meta-hide-on-mobile a::text').extract()
# Drop the comment-count entry that appears among the tags
tag_list = [element for element in tag_list if not element.strip().endswith('评论')]
# Join the tag list into a single comma-separated string
tags = ','.join(tag_list)
# Article body
article = response.css('.grid-8 .entry').extract_first()
# Upvote count
votetotal = response.css('.post-adds h10::text').extract_first()
# Non-greedy regex so multi-digit numbers are captured whole
match_re = re.match(r'.*?(\d+).*', votetotal)
if match_re:
    votetotal = int(match_re.group(1))  # first captured group
else:
    votetotal = 0  # default to 0 when no number is found
# Bookmark count
bookmark = response.css('.post-adds .bookmark-btn::text').extract_first()
match_re = re.match(r'.*?(\d+).*', bookmark)
if match_re:
    bookmark = int(match_re.group(1))
else:
    bookmark = 0
# Comment count
comments = response.css('.post-adds a .hide-on-480::text').extract_first()
match_re = re.match(r'.*?(\d+).*', comments)
if match_re:
    comments = int(match_re.group(1))
else:
    comments = 0
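A quick check of why the lazy `.*?` prefix matters here: with a greedy `.*(\d+).*`, the leading `.*` backtracks only far enough to hand the group a single digit, so multi-digit counts get truncated.

import re

text = ' 12 收藏'
print(re.match(r'.*(\d+).*', text).group(1))   # '2'  -- greedy prefix swallows the '1'
print(re.match(r'.*?(\d+).*', text).group(1))  # '12' -- lazy prefix stops before the number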
The get_parse method extracts the detail-page data:
# At the top of jobbole.py (the module holding get_md5 is an assumption;
# adjust to wherever you defined the helper):
import re
import datetime

from ArticleSpider.items import ArticlespiderItem
from ArticleSpider.utils.common import get_md5

# Inside JobboleSpider:
    def get_parse(self, response):
        # Receive the font_image_url passed through the request's meta
        image_url = response.meta.get('font_image_url', '')
        title = response.css('.grid-8 .entry-header h1::text').extract_first()
        data = response.css('.grid-8 .entry-meta p::text').extract_first().strip().replace('·', '').strip()
        category = response.css('.grid-8 .entry-meta p a::text').extract_first()
        tag = response.css('.grid-8 .entry-meta p a::text')[-1].extract().strip().replace('·', '').strip()
        article = response.css('.grid-8 .entry').extract_first()
        votetotal = response.css('.post-adds h10::text').extract_first()
        match_re = re.match(r'.*?(\d+).*', votetotal)
        if match_re:
            votetotal = int(match_re.group(1))
        else:
            votetotal = 0
        bookmark = response.css('.post-adds .bookmark-btn::text').extract_first()
        match_re = re.match(r'.*?(\d+).*', bookmark)
        if match_re:
            bookmark = int(match_re.group(1))
        else:
            bookmark = 0
        comments = response.css('.post-adds a .hide-on-480::text').extract_first()
        match_re = re.match(r'.*?(\d+).*', comments)
        if match_re:
            comments = int(match_re.group(1))
        else:
            comments = 0
        # Instantiate the item
        item = ArticlespiderItem()
        item['url'] = response.url
        # get_md5 compresses the URL into a fixed-length hash
        item['url_object_id'] = get_md5(response.url)
        item['image_url'] = [image_url]
        # strptime parses the date string into a datetime; take its date part
        try:
            data = datetime.datetime.strptime(data, '%Y/%m/%d').date()
        except Exception:
            # On any parsing error, fall back to the current system date
            data = datetime.datetime.now().date()
        item['data'] = data
        item['title'] = title
        item['category'] = category
        item['tag'] = tag
        item['article'] = article
        item['votetotal'] = votetotal
        item['bookmark'] = bookmark
        item['comments'] = comments
        # Hand the finished item to the pipelines
        yield item
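For completeness, a minimal sketch of the item class these assignments assume, plus a plausible get_md5 helper; the helper's body and location are assumptions, since only the call appears above:

# items.py
import scrapy

class ArticlespiderItem(scrapy.Item):
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    image_url = scrapy.Field()
    image_url_path = scrapy.Field()
    title = scrapy.Field()
    data = scrapy.Field()
    category = scrapy.Field()
    tag = scrapy.Field()
    article = scrapy.Field()
    votetotal = scrapy.Field()
    bookmark = scrapy.Field()
    comments = scrapy.Field()

# utils/common.py (assumed location and body)
import hashlib

def get_md5(url):
    # Reduce a URL of any length to a fixed 32-character hex digest
    if isinstance(url, str):
        url = url.encode('utf-8')
    m = hashlib.md5()
    m.update(url)
    return m.hexdigest()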
5. Download each article's cover image
Add this to settings.py:
import os

# Tell ImagesPipeline which item field holds the image URLs
IMAGES_URLS_FIELD = 'image_url'
# Absolute path of the directory containing settings.py
object_url = os.path.abspath(os.path.dirname(__file__))
# Store downloaded images in an image/ folder next to it
IMAGES_STORE = os.path.join(object_url, 'image')
Scrapy names the downloaded files automatically under the image folder. The results passed to ImagesPipeline's item_completed contain a path value giving the file name each URL was saved under, so we subclass the pipeline and copy that path into the item:
from scrapy.pipelines.images import ImagesPipeline


class ArticleImagePipeline(ImagesPipeline):
    def item_completed(self, results, item, info):
        if 'image_url' in item:
            for ok, value in results:
                if ok:  # skip failed downloads, which carry no 'path'
                    image_file_path = value['path']
                    item['image_url_path'] = image_file_path
        return item
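For the pipeline to run it also has to be registered in settings.py; a minimal sketch, with the ArticleSpider.pipelines module path following the default project layout. Note that ImagesPipeline needs Pillow installed (pip install Pillow).

ITEM_PIPELINES = {
    'ArticleSpider.pipelines.ArticleImagePipeline': 1,
}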
6. As the crawl speeds up, synchronous database writes can no longer keep pace, and the backlog of items waiting to be stored hurts performance, so we use Twisted's adbapi to make the MySQL inserts asynchronous.
import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi


class MysqlTwistePipline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbparms = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DB'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWORD'],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        # adbapi wraps MySQLdb in a Twisted-managed connection pool
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        # Run the insert asynchronously on the connection pool
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.hand_erro)
        return item

    def hand_erro(self, failure):
        # Report any failure raised during the asynchronous insert
        print(failure)

    def do_insert(self, cursor, item):
        url = item['url']
        url_object_id = item['url_object_id']
        image_urls = item['image_url']
        image_url = image_urls[0]
        image_url_path = item['image_url_path']
        title = item['title']
        data = item['data']
        category = item['category']
        tag = item['tag']
        article = item['article']
        votetotal = item['votetotal']
        bookmark = item['bookmark']
        comments = item['comments']
        cursor.execute(
            'insert into jobole(title, data, url, url_object_id, image_url,'
            ' image_url_path, tag, category, article, votetotal, bookmark, comments)'
            ' values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)',
            (title, data, url, url_object_id, image_url, image_url_path,
             tag, category, article, votetotal, bookmark, comments))
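The connection parameters are read from settings.py, where this pipeline must also be registered after the image pipeline. A minimal sketch; the host, database, user, and password values below are placeholders to adapt:

# settings.py (placeholder credentials)
MYSQL_HOST = '127.0.0.1'
MYSQL_DB = 'article_spider'
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'root'

ITEM_PIPELINES = {
    'ArticleSpider.pipelines.ArticleImagePipeline': 1,
    'ArticleSpider.pipelines.MysqlTwistePipline': 2,
}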