1. Required libraries
1) scrapy
2. First, write a script to run our spider
import sys
import os
from scrapy.cmdline import execute

# Print the directory containing this file
print(os.path.dirname(os.path.abspath(__file__)))
# Add that directory to the Python path
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# Run the spider, equivalent to "scrapy crawl 99spider" on the command line
execute(['scrapy', 'crawl', '99spider'])
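
As an alternative to scrapy.cmdline.execute, roughly the same thing can be done in-process with CrawlerProcess; a minimal sketch, assuming it is run from the project root so that settings.py can be found:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Load the project's settings.py and run the spider without shelling out
process = CrawlerProcess(get_project_settings())
process.crawl('99spider')
process.start()  # blocks until the crawl finishes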
3. Write the spider
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import jiankangItemLoader, Crawljiankang99Item

# Use CrawlSpider to crawl the whole site
class A99spiderSpider(CrawlSpider):
    # Name of the spider
    name = '99spider'
    # Domains the spider may crawl; links outside them are filtered out
    allowed_domains = ['www.99.com.cn',
                       'nv.99.com.cn',
                       'ye.99.com.cn',
                       'zyk.99.com.cn',
                       'jf.99.com.cn',
                       'fk.99.com.cn',
                       'zx.99.com.cn',
                       'bj.99.com.cn',
                       'nan.99.com.cn',
                       'jz.99.com.cn',
                       'gh.99.com.cn',
                       'news.99.com.cn']
    # URL the spider starts from
    start_urls = ['http://www.99.com.cn/']
    # Crawl rules for the CrawlSpider
    rules = (
        # Rule(LinkExtractor(allow=r'http://.*\.99\.com\.cn/'), follow=True),
        # allow matches pages to crawl; pages matching deny are skipped
        Rule(LinkExtractor(allow=r'.*/\d+\.htm',
                           deny=(r'/jijiu/jjsp/\d+\.htm',
                                 r'/jzzn/.*/\d+\.htm',
                                 r'/ssbd/jfsp/\d+\.htm',
                                 r'/zhongyiyangshengshipin/.*\.html',)),
             callback='parse_item', follow=True),
    )
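    # For example, under the rules above (hypothetical URLs for illustration):
    #   http://news.99.com.cn/jiankang/123456.htm   matches allow  -> parsed and followed
    #   http://www.99.com.cn/jijiu/jjsp/123456.htm  matches a deny -> skipped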
    # Parse the page with an ItemLoader
    def parse_item(self, response):
        image_url = response.css('.detail_con img::attr(src)').extract_first()
        item_loader = jiankangItemLoader(item=Crawljiankang99Item(), response=response)
        item_loader.add_css('title', '.title_box h1::text')
        item_loader.add_value('url', response.url)
        item_loader.add_css('content', '.detail_con')
        # ImagesPipeline expects a list of URLs, so wrap the single URL in a list
        item_loader.add_value('image_url', [image_url])
        jiankang = item_loader.load_item()
        return jiankang
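
The CSS selectors used in parse_item can be checked without running a full crawl by exercising them against sample markup with Scrapy's Selector; the HTML below is a made-up stand-in for a real article page:

from scrapy import Selector

# Made-up snippet imitating the structure parse_item expects
html = '''
<div class="title_box"><h1>Sample title</h1></div>
<div class="detail_con"><img src="http://www.99.com.cn/sample.jpg"><p>Body text</p></div>
'''
sel = Selector(text=html)
print(sel.css('.title_box h1::text').extract_first())         # Sample title
print(sel.css('.detail_con img::attr(src)').extract_first())  # http://www.99.com.cn/sample.jpg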
4. Write the item
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, MapCompose
# remove_tags strips the HTML tags from the content field
from w3lib.html import remove_tags

# Pass-through processor, used to keep image_url as a list instead of
# letting the default output processor collapse it to a single value
def return_value(value):
    return value

# Using an ItemLoader requires defining a custom ItemLoader subclass
class jiankangItemLoader(ItemLoader):
    # By default, take only the first extracted value for each field
    default_output_processor = TakeFirst()

class Crawljiankang99Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    url = scrapy.Field()
    title = scrapy.Field()
    # ImagesPipeline needs this field to stay a list, so override the
    # default TakeFirst() output processor with the pass-through above
    image_url = scrapy.Field(
        output_processor=MapCompose(return_value)
    )
    image_url_path = scrapy.Field()
    content = scrapy.Field(
        input_processor=MapCompose(remove_tags)
    )
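
A quick way to see what these processors do (each is called with the list of values extracted for a field):

from scrapy.loader.processors import MapCompose, TakeFirst
from w3lib.html import remove_tags

# MapCompose applies each function to every element of the input list
print(MapCompose(remove_tags)(['<p>hello</p>', '<b>world</b>']))  # ['hello', 'world']
# TakeFirst returns the first non-empty element, which is why the default
# output processor turns a list of matches into a single value
print(TakeFirst()(['first', 'second']))  # 'first'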
5. Write the pipelines to store data in MySQL
import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi
from scrapy.pipelines.images import ImagesPipeline

# Each successfully downloaded image in `results` carries a `path` with its
# location on disk; grab it and store it on the item for the database
class jiankangImagePipeline(ImagesPipeline):
    def item_completed(self, results, item, info):
        if 'image_url' in item:
            for ok, value in results:
                if ok:
                    item['image_url_path'] = value['path']
        return item
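
# For reference, `results` is a list of (success, info) tuples, one per
# requested image URL; on success, info['path'] is relative to IMAGES_STORE.
# A hypothetical example:
#   results = [
#       (True, {'url': 'http://www.99.com.cn/sample.jpg',
#               'path': 'full/0a79c461a4062ac383dc4fade7bc09f1.jpg',
#               'checksum': '2b00042f7481c7b056c4b410d28f33cf'}),
#   ]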
class Crawljiankang99Pipeline(object):
    def process_item(self, item, spider):
        return item
# Insert rows asynchronously with Twisted
class MysqlTwistePipline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    # Build the connection pool from the values in settings
    @classmethod
    def from_settings(cls, settings):
        dbparms = dict(
            # Database host
            host=settings['MYSQL_HOST'],
            # Database name
            db=settings['MYSQL_DB'],
            # Database user
            user=settings['MYSQL_USER'],
            # Database password
            passwd=settings['MYSQL_PASSWORD'],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        # adbapi runs the blocking MySQLdb calls in a thread pool,
        # so the inserts do not block the crawl
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        # Hand the insert to Twisted so the MySQL write happens asynchronously
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error)
        return item

    def handle_error(self, failure):
        print(failure)

    # Execute the SQL insert
    def do_insert(self, cursor, item):
        url = item['url']
        image_urls = item['image_url']
        image_url = image_urls[0]
        content = item['content']
        title = item['title']
        image_url_path = item['image_url_path']
        cursor.execute(
            'insert into jiankang(url, title, image_url, image_url_path, content) values(%s, %s, %s, %s, %s)',
            (url, title, image_url, image_url_path, content))
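
The insert above assumes a jiankang table already exists in the test database. A minimal sketch of a matching schema as a one-off setup script; the column types are assumptions, adjust them to the data:

import MySQLdb

# Connection values mirror the settings in the next section
conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd='', db='test', charset='utf8')
cursor = conn.cursor()
cursor.execute('''
    CREATE TABLE IF NOT EXISTS jiankang (
        url VARCHAR(255),
        title VARCHAR(255),
        image_url VARCHAR(255),
        image_url_path VARCHAR(255),
        content LONGTEXT
    ) DEFAULT CHARSET=utf8
''')
conn.commit()
conn.close()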
6. Write the settings
import os

# Database configuration
MYSQL_HOST = '127.0.0.1'
MYSQL_USER = 'root'
MYSQL_PASSWORD = ''
MYSQL_DB = 'test'

# Enable the pipelines; lower numbers run first, so the image pipeline
# fills in image_url_path before the MySQL pipeline inserts the item
ITEM_PIPELINES = {
    # jiankangImagePipeline subclasses ImagesPipeline, so the stock
    # scrapy.pipelines.images.ImagesPipeline does not need to be enabled as well
    'Crawljiankang99.pipelines.jiankangImagePipeline': 2,
    'Crawljiankang99.pipelines.MysqlTwistePipline': 4,
}

# Item field that holds the image URLs to download
IMAGES_URLS_FIELD = 'image_url'
# Directory containing this settings file
object_url = os.path.abspath(os.path.dirname(__file__))
# Store downloaded images in a folder named "image" next to it
IMAGES_STORE = os.path.join(object_url, 'image')