有关于简书页面爬取破解的两个爬虫编写时间20200104

JAY.LIN 收录于 Python

2023-11-02 约 1471 字预计阅读 3 分钟

https://bing.ee123.net/img/rand?artid=104037264

有关于简书页面爬取破解的两个爬虫（编写时间20200104）

由于课程设计需要编写了一系列爬虫，期中包括：

博客园页面+博客园搜索
百度搜索+百度文库
简书搜索+简书页面
爱学术搜索

具体目的就详细说明，反正就搜集一下资料和URL

下为简书的两个爬虫（编写时间20200104）

首先是简书页面爬虫，这个是基础爬虫，并没有什么反爬措施，只要频率别太高一般是可以用的

import requests
import time
from lxml import etree
import random
import os
import json

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
    }

def getContent(url):
    try:
        re = requests.get(url,headers = headers)
        re.encoding = re.apparent_encoding
        # print(re.text)
        html = etree.HTML(re.text)
        title = html.xpath("//*[@id='__next']/div[1]/div/div/section[1]/h1/text()")
        content = []
        for each in html.xpath("//*[@id='__next']/div[1]/div/div/section[1]/article/*"):
            con_temp = each.xpath("string(.)")
            if  con_temp:
                content.append(str(con_temp))
            con_temp = each.xpath(".//@data-original-src")

            if con_temp:
                for ImgIndex in range(len(con_temp)):
                    if ImgIndex>0 and con_temp[ImgIndex] is not con_temp[ImgIndex-1]:
                        content.append("![]({})".format(con_temp[ImgIndex]))
                    elif ImgIndex==0:
                        content.append("![]({})".format(con_temp[ImgIndex]))
                    # print(ImgIndex,con_temp[ImgIndex])
        return title,content
    except:
        return "",""
def save(title,content):
    with open("{}.md".format(title[0]),'w',encoding='utf-8') as f:
        f.write(str(title[0])+'\n\n')
        for each in content:
            f.write(each+'\n')


if __name__ == '__main__':
    url = "https://www.jianshu.com/p/280c6a6f2594"
    url = "https://www.jianshu.com/p/d75211cec9df"
    title,content = getContent(url)
    if title is not "":
            save(title,content)
            pass
    else:
        print("访问出错，请确认下url试试")

接下来是比较有意思的简书搜索爬虫——简书搜索爬虫
就是这个搜索框（由于这个爬虫是20年1月4号编写，可能存在一定的局限性）
没错、这个搜索功能的返回值动态的、似乎还有个密码
在请求头中有几样必不可少的东西，分别是：x-csrf-token、_m7e_session_core还有user-agent
user-agent好说，是不变的，但是前两个是动态生成的，是会变的，如何破解它的生成机制就是关键
仔细分析搜索简书分析流程发现简书的搜索分为两个步骤：
- 1.搜索 url = "https://www.jianshu.com/search?q={}&page=1&type=note".format(word)
- 2.搜索 url = "https://www.jianshu.com/search/do?q={}&type=note&page={}&order_by=default".format( word, page)
第一步search下的这个URL，会返回一个页面，具体长啥样忘了，这个页面里就直接存在期中一个值 head["x-csrf-token"] = str(html.xpath("//*[@name='csrf-token']/@content")[0]) ，就是这个x-csrf-token，我们可以通过xpath获得，另外一个值在cookie中 strCookie = "_m7e_session_core={};".format( re.cookies.get("_m7e_session_core")) 通过类似与字典取值的方法将这个cookie取出来，然后用这两个值二次构造我们的header请求头
第二步do下的这个URL是请求真正的数据的，也是此行的主要目标，用上二次构造好的header重新访问，即可拿到真正的数据，是Json的，接下来就是机械化地分析~数据处理等

PS 更新：文末会指出两个关键值的所在位置

import requests
import time
from lxml import etree
import random
import os
import json
import utils.Utils as sq

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
}

proxy_list = [
    '117.90.131.247:8118',
    '171.11.32.77:9999',
    '223.199.31.112:9999',
    '27.191.234.69:9999',
    '223.199.31.5:9999',
    '60.167.135.179:9999'
]


# 获取必要的两个访问头x-csrf-token和_m7e_session_core
def getCookie(word, head, proxy):
    # 先通过必要的一次访问获取x-csrf-token值和cookie中_m7e_session_core的值，二次构造header
    url = "https://www.jianshu.com/search?q={}&page=1&type=note".format(word)
    re = requests.get(url=url, headers=head, timeout=30, proxies=proxy)
    html = etree.HTML(re.text)
    head["x-csrf-token"] = str(html.xpath("//*[@name='csrf-token']/@content")[0])
    strCookie = "_m7e_session_core={};".format(
        re.cookies.get("_m7e_session_core"))
    head["cookie"] = strCookie
    # print(head)
    return head


def getContent(word, head, proxy, page, content, total_pages):
    print("当前第{}页".format(page))
    try:
        # 用post请求访问真正数据
        url = "https://www.jianshu.com/search/do?q={}&type=note&page={}&order_by=default".format(
            word, page)
        re = requests.post(url=url, headers=head, timeout=30, proxies=proxy)

        # Json分析
        js = json.loads(re.text)
        # 从JSON中获取总页数
        try:
            if total_pages == 0:
                total_pages = js["total_pages"]
            data = js["entries"]
        except:
            print("total_pages or error")

        if data:
            for each in data:
                temp_title = each["title"].replace("\"", "").replace("\n", "").replace(")", " ").replace("(",
                                                                                                         " ").replace(
                    "\\", "")
                temp_content = each["content"].replace("\"", "").replace("\n", "").replace(")", " ").replace("(",
                                                                                                             " ").replace(
                    "\\", "")
                temp_url = "https://www.jianshu.com/p/{}".format(each["slug"])
                # print("{}\n{}\n{}\n\n".format(temp_title, temp_content, temp_url))
                content.append([temp_url, temp_title, temp_content])
        # 递归访问下一页
        if total_pages != 0 and total_pages > page and page < 15:
            time.sleep(1)
            return getContent(word=word, head=head, proxy=proxy, page=page + 1, content=content,
                              total_pages=total_pages)
    except NameError as  e:
        print(e, "访问出错")
    return content


def do(word):
    # 随机获取代理ip
    proxy = {"http": random.choice(proxy_list)}
    # print(proxy)
    head = getCookie(word=word, head=headers.copy(), proxy=proxy)
    content = getContent(word=word, head=head, proxy=proxy, page=1, content=[], total_pages=0)
    print(len(content))
    # for each in content:
    #     print("title={}\ncontent={}\nurl={}\n\n".format(each[0],each[1],each[2]))
    sq.insert_into_inital_data(content, "简书")


if __name__ == '__main__':
    word = "电路SAT"
    do(word)

总结

破解访问机制还是蛮有意思的，拿着自己的爬虫出来溜一圈才发现之前学的不过是皮毛罢了，简书这个还算比较简单的，CSDN这个我到现在还没完全破解，等我有时间了好好研究这个页面加密的问题是个什么情况。
刚刚回去翻了翻，似乎简书的搜索功能被关了？好像不太能用了~~估计是在更新？还是就挂了？等它好了再尝试给它破解看看~~

更新：

搜索功能可以用了，发现似乎没啥变化
下面是：x-csrf-token的位置
下面是： _m7e_session_core 的位置

目录

有关于简书页面爬取破解的两个爬虫编写时间20200104

有关于简书页面爬取破解的两个爬虫（编写时间20200104）

由于课程设计需要编写了一系列爬虫，期中包括：

具体目的就详细说明，反正就搜集一下资料和URL

下为简书的两个爬虫（编写时间20200104）

PS 更新：文末会指出两个关键值的所在位置

总结

更新：