欢迎来到 Python 学习!这是我人生旅途的第一课,非常重要。我从这个案例中学到了许多关于 Python 爬虫的知识,它是我学习 Python 爬虫的启蒙。
#所需要的库
requests:用于网页请求
BeautifulSoup:用于解析 HTML 并选取所需的元素
json:用来解析json数据的库
re:用于通过正则表达式匹配并筛选文本
代码演示:
import json
import re
from hashlib import md5
import os
import requests
from urllib.parse import urlencode
from bs4 import BeautifulSoup
from config import *
from multiprocessing import Pool
#请求搜索索引页,获取返回的 JSON 文本
def get_page_index(offset, keyword):
    """Fetch one page of Toutiao search results as raw response text.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Search term sent as the ``keyword`` query parameter.

    Returns:
        The response body as text when the server answers with status
        200; ``None`` for any other status or when the request fails.
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': 3,
        'from': 'gallery',
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'
    }
    try:
        # A timeout keeps a stalled connection from blocking the worker
        # process forever; requests waits indefinitely by default.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        # Only request-level failures are expected here. A bare ``except``
        # would also hide programming errors and KeyboardInterrupt.
        print('请求索引失败')
        return None
    if response.status_code == 200:
        return response.text
    return None
#从 JSON 数据中提取每篇文章的链接
def prase_get_index(html):
    """Yield the article URL of every entry in a search-index JSON payload.

    Args:
        html: JSON text whose top-level object carries a ``data`` list of
            entries. ``None`` or empty text is tolerated because the
            index request returns ``None`` on failure.

    Yields:
        Each entry's non-empty ``article_url`` value, in payload order.
        Nothing is yielded when the text is missing, is not valid JSON,
        or does not contain a usable ``data`` list.
    """
    if not html:
        return
    try:
        payload = json.loads(html)
    except ValueError:
        # json.JSONDecodeError subclasses ValueError; a malformed response
        # should end the iteration quietly instead of killing the worker.
        return
    if not isinstance(payload, dict):
        return
    entries = payload.get('data')
    if not isinstance(entries, list):
        # Covers both a missing key and an explicit ``"data": null``.
        return
    for item in entries:
        if not isinstance(item, dict):
            continue
        article_url = item.get('article_url')
        # Entries without a URL are skipped so callers never receive None.
        if article_url:
            yield article_url
#用正则从详情页中提取图集的 JSON 数据并解析
def prase_page_urlli(html):
    """Extract the title and gallery image URLs from a page and download them.

    Every image URL found is passed to ``dowload_image``.

    Args:
        html: HTML text of the page to inspect.

    Returns:
        ``{'title': <page title>, 'image': [<url>, ...]}`` when the page
        embeds a ``gallery: JSON.parse(...)`` payload that contains a
        ``sub_images`` key; ``None`` otherwise.
    """
    soup = BeautifulSoup(html, 'lxml')
    title_tags = soup.select('title')
    # Indexing [0] unconditionally raised IndexError on pages without <title>.
    title = title_tags[0].get_text() if title_tags else ''
    print(title)

    # Raw string: ``\(`` in a normal literal is an invalid escape sequence
    # that newer Python versions warn about.
    images_pattern = re.compile(r'gallery: JSON.parse\((.*?)\)', re.S)
    result = images_pattern.search(html)
    if not result:
        return None

    try:
        data = json.loads(result.group(1))
        # SECURITY: the original code ran eval() on the decoded text, which
        # executes arbitrary expressions taken from a remote page. The first
        # decode yields a string holding JSON, so decode it a second time.
        if isinstance(data, str):
            data = json.loads(data)
    except ValueError:
        # Truncated or malformed payload: treat as "no gallery found".
        return None

    if not isinstance(data, dict) or 'sub_images' not in data:
        return None

    sub_images = data.get('sub_images') or []
    # Stray backslashes are still stripped, as the original did, in case the
    # payload was only decoded once. Entries without a URL are skipped.
    image = [
        item.get('url').replace('\\', '')
        for item in sub_images
        if isinstance(item, dict) and item.get('url')
    ]
    for images_page in image:
        dowload_image(images_page)
    return {
        'title': title,
        'image': image,
    }
#请求文章详情页,获取其 HTML 文本
def get_page_urlli(url):
    """Fetch a page and return its body as text.

    Args:
        url: Address of the page to request.

    Returns:
        The response text when the server answers with status 200;
        ``None`` for any other status or when the request fails.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'
    }
    try:
        # Without a timeout a dead server would block this worker forever.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        # Narrowed from a bare ``except`` so that only request failures
        # (connection errors, timeouts, invalid URLs) are reported here.
        print('请求失败', url)
        return None
    if response.status_code == 200:
        return response.text
    return None
#下载图片
def dowload_image(url):
    """Download one image and hand its bytes to ``save_image``.

    Args:
        url: Address of the image to fetch.

    Returns:
        Always ``None``. Failures are reported on stdout and do not
        interrupt the crawl.
    """
    print("正在下载", url)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'
    }
    try:
        # The timeout prevents one unresponsive image host from stalling
        # the whole worker.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        print('请求图片出错', url)
        return None
    if response.status_code == 200:
        try:
            save_image(response.content)
        except OSError:
            # The old bare ``except`` reported disk errors as request
            # errors. Keep the best-effort behaviour, but say what failed.
            print('保存图片出错', url)
    return None
#把图片保存到本地路径下
def save_image(content):
    """Write image bytes to ``<cwd>/<md5 of content>.jpg`` unless it exists.

    The file name is the MD5 hex digest of the content, so identical
    images map to the same path and are stored only once.

    Args:
        content: Raw image bytes.
    """
    file_name = '{0}.{1}'.format(md5(content).hexdigest(), 'jpg')
    file_path = os.path.join(os.getcwd(), file_name)
    try:
        # Mode 'x' creates the file atomically and fails if it is already
        # there. A separate exists() check followed by open() can race when
        # several pool workers save the same image at once.
        with open(file_path, 'xb') as f:
            f.write(content)
    except FileExistsError:
        # Already saved (same digest means same content): nothing to do.
        pass
def main(offset):
    """Crawl one search-result page and print what each article yields.

    Fetches the index for ``offset``, requests every article URL listed
    in it, parses each page that was fetched successfully and prints the
    parse result.

    Args:
        offset: Result offset forwarded to ``get_page_index``.
    """
    # ``keyword`` is not defined in this file; presumably it is supplied by
    # ``from config import *`` -- TODO confirm.
    index_html = get_page_index(offset, keyword)
    if not index_html:
        # get_page_index returns None on failure; feeding that to the JSON
        # parser would raise and abort this worker.
        return
    for url in prase_get_index(index_html):
        if not url:
            # Index entries may lack an article URL.
            continue
        page_html = get_page_urlli(url)
        if page_html:
            result = prase_page_urlli(page_html)
            print(result)
if __name__ == '__main__':
    # One offset per result page, stepping by 20 to match the ``count``
    # sent with each index request. ``stat`` and ``end`` are not defined in
    # this file; presumably they come from ``from config import *`` --
    # TODO confirm.
    group = [x * 20 for x in range(stat, end + 1)]
    # map() blocks until every offset is processed; the context manager
    # then releases the worker processes instead of leaking the pool.
    with Pool() as pool:
        pool.map(main, group)
详细的解释。