最新的模拟知乎登陆

首先我们知道，随着知乎页面的不断改版，以前的模拟登录方法已经不能用了。以下是知乎改版之后最新的登录方法。

一、首先我们所需要的库

import requests
import time
import re
#用于下载验证码图片
import base64
#通过 Hmac 算法计算返回签名。实际是几个固定字符串加时间戳
import hmac
import hashlib
import json
import matplotlib.pyplot as plt
#保存cookie
from http import cookiejar
#打开图片
from PIL import Image

二、所需要的头信息

# Base request headers sent with every request.
HEADERS = {
    'Connection': 'keep-alive',
    'Host': 'www.zhihu.com',
    'Referer': 'https://www.zhihu.com/',
    'User-Agent': (
        'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/56.0.2924.87 Mobile Safari/537.36'
    ),
}

# URLs used during login: the sign-up page and the sign-in API endpoint.
LOGIN_URL = 'https://www.zhihu.com/signup'
LOGIN_API = 'https://www.zhihu.com/api/v3/oauth/sign_in'

# Base login form fields.
FORM_DATA = {
    # The client id basically never changes.
    'client_id': 'c3cef7c66a1843f8b3a9e6a1e3160e20',
    'grant_type': 'password',
    'source': 'com.zhihu.web',
    # Placeholders: replace with the real account name and password.
    'username': '用户名',
    'password': '密码',
    # Change to 'cn' to get the "click the upside-down Chinese characters" captcha.
    'lang': 'en',
    'ref_source': 'homepage',
}

要想登录成功，header 里还必须有以下两个参数：

#经过大量的验证，这个参数必须有，这个值基本不变
'authorization': 'oauth c3cef7c66a1843f8b3a9e6a1e3160e20',
#X-Xsrftoken则是防 Xsrf 跨站的 Token 认证，在Response Headers的Set-Cookie字段中可以找到。所以我们需要先请求一次登录页面，然后用正则把这一段匹配出来。注意需要无 Cookies 请求才会返回 Set-Cookie
'X-Xsrftoken': _xsrf

Set-Cookie

要想登录成功，form_data 里也还必须有以下三个参数：

'captcha': 验证码,
'timestamp': 时间戳,
'signature': 是通过 Hmac 算法对几个固定值和时间戳进行加密

timestamp 时间戳，这个很好解决，区别是这里是13位整数，Python 生成的整数部分只有10位，需要额外乘以1000

# Unix time in milliseconds (13 digits), rendered as a decimal string.
timestamp = format(int(time.time() * 1000), 'd')

captcha 验证码，是通过 GET 请求单独的 API 接口返回是否需要验证码（无论是否需要，都要请求一次），如果是 True 则需要再次 PUT 请求获取图片的 base64 编码。

 def _get_captcha(self, headers):
    """
    Query the captcha API and, if a captcha is required, have the user solve it.

    The API is always requested once with GET, whether or not a captcha ends
    up being needed. If the response text contains ``true``, a PUT request
    fetches the image as base64; it is written to ``./captcha.jpg``, shown to
    the user, and the answer is POSTed back to the same endpoint before being
    returned.

    The captcha language comes from ``headers['lang']`` when present.
    Otherwise it falls back to ``self.login_data['lang']`` (presumably the
    login form, which is where this file keeps its ``lang`` setting), and
    finally to ``'en'``. Any value other than ``'cn'`` is treated as ``'en'``.

    :param headers: request headers carrying the authorization info
    :return: the captcha answer for the login form, or ``''`` when no captcha
        is required
    :raises IndexError: if the PUT response contains no ``img_base64`` field
    """
    lang = headers.get('lang')
    if lang is None:
        # The request headers normally carry no 'lang' entry, so without this
        # fallback the 'cn' branch below could never be selected.
        login_data = getattr(self, 'login_data', None) or {}
        lang = login_data.get('lang', 'en')

    api = ('https://www.zhihu.com/api/v3/oauth/captcha?lang='
           + ('cn' if lang == 'cn' else 'en'))

    resp = self.session.get(api, headers=headers)
    if not re.search(r'true', resp.text):
        return ''

    put_resp = self.session.put(api, headers=headers)
    # Literal "\n" sequences are stripped from the captured base64 text
    # before decoding.
    img_base64 = re.findall(
        r'"img_base64":"(.+)"', put_resp.text, re.S)[0].replace(r'\n', '')
    with open('./captcha.jpg', 'wb') as f:
        f.write(base64.b64decode(img_base64))

    # Use the image as a context manager so its file handle is released.
    with Image.open('./captcha.jpg') as img:
        if lang == 'cn':
            plt.imshow(img)
            print('点击所有倒立的汉字，按回车提交')
            points = plt.ginput(7)
            # Clicked coordinates are halved before being submitted together
            # with the fixed image size.
            capt = json.dumps({'img_size': [200, 44],
                               'input_points': [[p[0] / 2, p[1] / 2] for p in points]})
        else:
            img.show()
            capt = input('请输入图片里的验证码：')

    # The answer must be POSTed to the captcha endpoint before it is used.
    self.session.post(api, data={'input_text': capt}, headers=headers)
    return capt

signature：通过 Ctrl+Shift+F 搜索可以找到它是在一个 JS 文件里生成的，是用 HMAC 算法对几个固定值和时间戳计算得到的签名，所以只需要在 Python 里把这个计算过程模拟一遍即可。

def _get_signature(self, timestamp):
    """
    Compute the HMAC-SHA1 signature for the login form.

    The signed message is the ``grant_type``, ``client_id`` and ``source``
    values of ``self.login_data`` concatenated in that order, followed by
    the timestamp, keyed with a fixed secret.

    :param timestamp: timestamp string appended to the message as-is
    :return: hexadecimal digest of the signature
    """
    fields = [self.login_data[key]
              for key in ('grant_type', 'client_id', 'source')]
    message = ''.join(fields) + timestamp
    signer = hmac.new(
        b'd1b964811afb40118a12068ff74a12f4',
        msg=message.encode('utf-8'),
        digestmod=hashlib.sha1,
    )
    return signer.hexdigest()

文章出自知乎