对百度糯米、美团、大众点评的数据进行爬取,并保存到 MongoDB 数据库和 MySQL 数据库

一、首先准备我们需要的库

1、requests//用来请求网页

2、json//用来解析json数据,用于把json数据转换为字典

3、re//利用正则对字符串查找

4、pyquery//利用css选择器查找

5、pymongo//存取 MongoDB

6、MySQLdb//存取 MySQL

创建一个配置文件config.py,用于存储数据库的连接信息(主机、用户名、密码、库名和表名)

# MySQL connection settings; the save_mysql functions pass them positionally
# to MySQLdb.connect(host, user, password, database).
MYSQL_HOST='localhost'
MYSQL_USER='root'
MYSQL_PASSWORD=''
MYSQL_DB='test'
# MongoDB settings: MOGO_URL is handed to pymongo.MongoClient, and each site
# has its own database name / collection name pair.
MOGO_URL='localhost'
# Used by the Baidu Nuomi scraper.
MOGO_DB='nuomi'
MOGO_TABLE='product'
# Used by the Meituan scraper.
MOGO_DB_M='meituan'
MOGO_TABLE_M='product'
# Used by the Dianping scraper.
MOGO_DB_D='dazhong'
MOGO_TABLE_D='product'

二、百度糯米

1、获取全国的url

def get_city():
    """Fetch the Nuomi city list and start a keyword search for each city.

    Downloads the change-city page, walks every ``li`` under
    ``.city-list .cities`` and calls ``get_pase`` with the city's URL and the
    global ``keyword`` (defined outside this function).
    """
    url = 'https://www.nuomi.com/pcindex/main/changecity'
    response = requests.get(url)
    response.encoding = 'utf-8'
    doc = pq(response.text)
    for item in doc('.city-list .cities li').items():
        href = item.find('a').attr('href')
        if not href:
            # An entry without a link would make the concatenation below
            # raise TypeError and abort the whole crawl; skip it instead.
            continue
        # The scheme is prepended because the hrefs are presumably
        # protocol-relative ("//xx.nuomi.com") -- TODO confirm.
        get_pase('https:' + href, keyword)

2、通过关键字搜索商品

def _crawl_nuomi_shop_links(html):
    """Scrape every shop detail page linked from one search-result page."""
    shop_link = r'<a href="(.*?)" target="_blank"><img src=".*?" class="shop-infoo-list-item-img" /></a>'
    for link in re.findall(shop_link, html):
        get_pase_url('https:' + link)


def get_pase(url,keyword):
    """Search one Nuomi city site for ``keyword`` and scrape the shops found.

    Args:
        url: Base URL of the city site; ``/search?k=...`` is appended to it.
        keyword: Search term, URL-encoded into the ``k`` query parameter.

    The first result page and every pager page linked from it
    (``class="ui-pager-normal"``) are scanned for shop links, and each shop
    link is passed to ``get_pase_url``.
    """
    query = urlencode({'k': keyword})
    response = requests.get(url + '/search?' + query)
    response.encoding = 'utf-8'
    first_page = response.text
    if 'noresult-tip' in first_page:
        print('抱歉,没有找到你搜索的内容')
        return

    _crawl_nuomi_shop_links(first_page)

    # Pager links lead to further *search-result* pages, not to shop pages,
    # so each one has to be fetched and scanned for shop links rather than
    # being handed to the shop-detail scraper directly.
    pager_link = r'<a href="(.*?)" .*? class="ui-pager-normal" .*?</a>'
    visited = set()
    for path in re.findall(pager_link, first_page):
        if path in visited:
            # The same page can be linked more than once in the pager.
            continue
        visited.add(path)
        page = requests.get(url + path)
        page.encoding = 'utf-8'
        _crawl_nuomi_shop_links(page.text)

3、获取商品页的商品信息

def get_pase_url(url):
    """Download one Nuomi shop page, extract its fields and save them.

    Each field is the text of the elements matched by a CSS selector; the
    resulting dict is printed and then passed to ``save_mysql``.
    """
    page = requests.get(url)
    page.encoding = 'utf-8'
    dom = pq(page.text)

    # Field name -> CSS selector, in the order the fields are stored.
    selectors = (
        ('title', '.shop-box .shop-title'),
        ('score', 'body > div.main-container > div.shop-box > p > span.score'),
        ('price', '.shop-info .price'),
        ('location', '.item .detail-shop-address'),
        ('phone', 'body > div.main-container > div.shop-box > ul > li:nth-child(2) > p'),
        ('time', 'body > div.main-container > div.shop-box > ul > li:nth-child(3) > p'),
        ('tuijian', 'body > div.main-container > div.shop-box > ul > li:nth-child(4) > p'),
    )
    product = {field: dom(css).text() for field, css in selectors}

    print(product)
    # MySQL is the active backend; save_mongodb(product) is the alternative.
    save_mysql(product)

4、保存到数据库中

def save_mysql(product):
    """Insert one Nuomi shop record into the MySQL ``nuomi`` table.

    Args:
        product: Dict with the keys ``title``, ``score``, ``price``,
            ``location``, ``phone``, ``time`` and ``tuijian``.

    Raises:
        KeyError: If ``product`` lacks one of the expected keys.
        MySQLdb.Error: If connecting or inserting fails.
    """
    columns = ('title', 'score', 'price', 'location', 'phone', 'time', 'tuijian')
    # Placeholders let the driver escape the scraped text; building the
    # statement with str.format broke on any value containing a quote and
    # allowed SQL injection.
    sql = (
        "insert into nuomi(title,score,price,location,phone,time,tuijian) "
        "values(%s,%s,%s,%s,%s,%s,%s)"
    )
    conn = MySQLdb.connect(MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DB, charset='utf8')
    try:
        cursor = conn.cursor()
        cursor.execute(sql, tuple(product[name] for name in columns))
        # MySQLdb does not autocommit by default; without this the row is
        # rolled back when the connection goes away.
        conn.commit()
    finally:
        conn.close()
    print('成功存入数据库', product)
def save_mongodb(result):
    """Store one Nuomi record in MongoDB (database MOGO_DB, collection MOGO_TABLE).

    Args:
        result: Dict to insert as a document.

    Failures are reported on stdout together with the error instead of being
    raised, so one bad record does not stop the crawl.
    """
    client = pymongo.MongoClient(MOGO_URL)
    try:
        # insert_one replaces Collection.insert, which is deprecated in
        # PyMongo 3 and removed in PyMongo 4.
        client[MOGO_DB][MOGO_TABLE].insert_one(result)
        print('保存成功', result)
    except Exception as exc:
        print('保存失败', result, exc)
    finally:
        client.close()

image

二、美团

1、获取全国的url

def get_city():
    """Fetch the Meituan city list and start the search for each city.

    Downloads the change-city page, walks every element matching
    ``.city-area .cities .city`` and calls ``get_url_number`` with the city's
    URL.
    """
    url = 'http://www.meituan.com/changecity/'
    response = requests.get(url)
    response.encoding = 'utf-8'
    doc = pq(response.text)
    for item in doc('.city-area .cities .city').items():
        href = item.attr('href')
        if not href:
            # An element without an href would make the concatenation below
            # raise TypeError and abort the whole crawl; skip it instead.
            continue
        # The scheme is prepended because the hrefs are presumably
        # protocol-relative ("//xx.meituan.com") -- TODO confirm.
        get_url_number('http:' + href)

2、通过关键字搜索商品

def get_url_number(url):
    """Search one Meituan city for the global ``keyword`` and save the results.

    The city page at ``url`` is fetched to read the numeric city id from its
    embedded ``currentCity`` JSON. The search API is then paged through in
    steps of 32 (offsets 0..499) and every entry of ``searchResult`` is saved
    with ``save_mysql``.

    Args:
        url: URL of the city's Meituan home page.

    Returns:
        None. Errors are printed rather than raised so that one failing city
        does not stop the crawl of the remaining cities.
    """
    city_id_pattern = r'{"currentCity":{"id":(.*?),"name":".*?","pinyin":'
    try:
        response = requests.get(url)
        city_ids = re.findall(city_id_pattern, response.text)
        if not city_ids:
            print('未找到城市编号', url)
            return None
        api = 'http://apimobile.meituan.com/group/v4/poi/pcsearch/{}'.format(city_ids[0])
        for offset in range(0, 500, 32):
            # Passing the query via params lets requests URL-encode the keyword.
            response = requests.get(api, params={'limit': 32, 'offset': offset, 'q': keyword})
            results = json.loads(response.text)['data']['searchResult']
            if not results:
                # An empty page means there is nothing further to fetch.
                break
            # Each page carries up to 32 entries; save all of them, not just
            # the first one.
            for entry in results:
                save_mysql({
                    'url_id': entry['id'],
                    'imageUrl': entry['imageUrl'],
                    'address': entry['address'],
                    'lowestprice': entry['lowestprice'],
                    'title': entry['title'],
                })
    except Exception as exc:
        # Best-effort boundary: report the failure and give up on this city.
        print('美团数据抓取失败', url, exc)
    return None

3、保存到数据库中

def save_mysql(product):
    """Insert one Meituan search result into the MySQL ``meituan`` table.

    Args:
        product: Dict with the keys ``url_id``, ``imageUrl``, ``address``,
            ``lowestprice`` and ``title``.

    Raises:
        KeyError: If ``product`` lacks one of the expected keys.
        MySQLdb.Error: If connecting or inserting fails.
    """
    columns = ('url_id', 'imageUrl', 'address', 'lowestprice', 'title')
    # Placeholders let the driver escape the scraped text; building the
    # statement with str.format broke on any value containing a quote and
    # allowed SQL injection.
    sql = (
        "insert into meituan(url_id,imageUrl,address,lowestprice,title) "
        "values(%s,%s,%s,%s,%s)"
    )
    conn = MySQLdb.connect(MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DB, charset='utf8')
    try:
        cursor = conn.cursor()
        cursor.execute(sql, tuple(product[name] for name in columns))
        # MySQLdb does not autocommit by default; without this the row is
        # rolled back when the connection goes away.
        conn.commit()
    finally:
        conn.close()
    print('成功存入数据库', product)
def save_mongodb(result):
    """Store one Meituan record in MongoDB (database MOGO_DB_M, collection MOGO_TABLE_M).

    Args:
        result: Dict to insert as a document.

    Failures are reported on stdout together with the error instead of being
    raised, so one bad record does not stop the crawl.
    """
    client = pymongo.MongoClient(MOGO_URL)
    try:
        # insert_one replaces Collection.insert, which is deprecated in
        # PyMongo 3 and removed in PyMongo 4.
        client[MOGO_DB_M][MOGO_TABLE_M].insert_one(result)
        print('保存成功', result)
    except Exception as exc:
        print('保存失败', result, exc)
    finally:
        client.close()

image

三、大众点评

1、获取全国的url

def get_url_city_id():
    """Load Dianping's domestic city list and search one city per group.

    The JSON response's ``cityMap`` is read for the keys "1" through "34";
    for each group, the name, id and English name of its first city are
    passed to ``get_url_keyword``.
    """
    endpoint = 'https://www.dianping.com/ajax/citylist/getAllDomesticCity'
    request_headers = {
        'User-Agent': 'Mozilla/5.0(Windows NT 10.0;Win64;x64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/64.0.3282.186Safari/537.36',
    }
    payload = json.loads(requests.get(endpoint, headers=request_headers).text)
    for group_no in range(1, 35):
        for city in payload['cityMap'][str(group_no)]:
            get_url_keyword({
                'cityName': city['cityName'],
                'cityId': city['cityId'],
                'cityEnName': city['cityEnName'],
            })
            # NOTE(review): this stops after the first city of every group,
            # so the remaining cities are never searched -- confirm that this
            # limit is intentional.
            break

2、通过关键字搜索商品

def get_url_keyword(product):
    """Search Dianping in one city for the global ``keyword``.

    Args:
        product: City dict; only ``product['cityId']`` is read here.

    Every shop link found on the result page is passed to ``get_url_data``.
    """
    from urllib.parse import quote

    # The path segment is "0_<keyword>". The previous "0_%{}" template left a
    # stray "%" in front of the keyword, so the site was asked to search for
    # "%<keyword>". The keyword is percent-encoded explicitly instead.
    urls = 'https://www.dianping.com/search/keyword/{}/0_{}'.format(product['cityId'], quote(keyword))
    headers = {
        'User-Agent': 'Mozilla/5.0(Windows NT 10.0;Win64;x64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/64.0.3282.186Safari/537.36',
    }
    response = requests.get(urls, headers=headers)
    shop_link = r'data-hippo-type="shop" title=".*?" target="_blank" href="(.*?)"'
    for url in re.findall(shop_link, response.text):
        get_url_data(url)

3、获取商品页的商品信息

def get_url_data(url):
    """Download one Dianping shop page, extract its fields and save them.

    Args:
        url: URL of the shop detail page.

    The fields are read with CSS selectors rooted at ``#basic-info`` and
    ``#comment_score`` and the resulting dict is passed to ``save_mysql``.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0(Windows NT 10.0;Win64;x64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/64.0.3282.186Safari/537.36',
        'Host': 'www.dianping.com',
        # These two entries previously contained pasted-in spaces
        # ('no - cache', 'Upgrade - Insecure - Requests'), which made them an
        # unknown directive and an unknown header name respectively.
        'Pragma': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
    }
    response = requests.get(url, headers=headers)
    doc = pq(response.text)

    # Newlines and non-breaking spaces are stripped from the shop title.
    title = doc('#basic-info > h1').text().replace('\n', '').replace('\xa0', '')
    # Named open_time locally so the stdlib module name "time" is not shadowed;
    # the stored key is still 'time'.
    open_time = doc('#basic-info > div.other.J-other > p:nth-of-type(1) > span.item').text()

    product = {
        'title': title,
        'avgPriceTitle': doc('#avgPriceTitle').text(),
        'taste': doc('#comment_score > span:nth-of-type(1)').text(),
        'Environmental': doc('#comment_score > span:nth-of-type(2)').text(),
        'service': doc('#comment_score > span:nth-of-type(3)').text(),
        'street_address': doc('#basic-info > div.expand-info.address > span.item').text(),
        'tel': doc('#basic-info > p > span.item').text(),
        'info_name': doc('#basic-info > div.promosearch-wrapper > p > span').text(),
        'time': open_time,
    }
    save_mysql(product)

4、保存到数据库中

def save_mysql(product):
    """Insert one Dianping shop record into the MySQL ``dazhong`` table.

    Args:
        product: Dict with the keys ``title``, ``avgPriceTitle``, ``taste``,
            ``Environmental``, ``service``, ``street_address``, ``tel``,
            ``info_name`` and ``time``.

    Raises:
        KeyError: If ``product`` lacks one of the expected keys.
        MySQLdb.Error: If connecting or inserting fails.
    """
    columns = (
        'title', 'avgPriceTitle', 'taste', 'Environmental', 'service',
        'street_address', 'tel', 'info_name', 'time',
    )
    # Placeholders let the driver escape the scraped text; building the
    # statement with str.format broke on any value containing a quote and
    # allowed SQL injection.
    sql = (
        "insert into dazhong(title,avgPriceTitle,taste,Environmental,service,street_address,tel,info_name,time) "
        "values(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    )
    conn = MySQLdb.connect(MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DB, charset='utf8')
    try:
        cursor = conn.cursor()
        cursor.execute(sql, tuple(product[name] for name in columns))
        # MySQLdb does not autocommit by default; without this the row is
        # rolled back when the connection goes away.
        conn.commit()
    finally:
        conn.close()
    print('成功存入数据库' , product)
def save_mogodb(product):
    """Store one Dianping record in MongoDB (database MOGO_DB_D, collection MOGO_TABLE_D).

    Args:
        product: Dict to insert as a document.

    Failures are reported on stdout together with the error instead of being
    raised, so one bad record does not stop the crawl.
    """
    client = pymongo.MongoClient(MOGO_URL)
    try:
        # insert_one replaces Collection.insert, which is deprecated in
        # PyMongo 3 and removed in PyMongo 4.
        client[MOGO_DB_D][MOGO_TABLE_D].insert_one(product)
        print('保存成功', product)
    except Exception as exc:
        print('保存失败', product, exc)
    finally:
        client.close()

详细的描述可以访问我的 GitHub(TomorrowLi),里面有我的源码。

-------------本文结束感谢您的阅读-------------
坚持原创技术分享,您的支持将鼓励我继续创作!