One day I suddenly felt like putting together a collection of Anmi's works, so I opened yande.re and frowned at a full seven pages of images. That didn't stop me, though: a Baidu search turned up a Python script that downloads yande.re images by tag. This was a few days ago, so I can't find the original page anymore. The author was rather mysterious and had simply pasted the code with no explanation, and as posted it wouldn't even get past compilation (can you even call it compilation with Python, or is that an interpreter check?). I patched it up against the error messages until the script would actually run.
Incidentally, the script seems to require Python 3. Server users, especially those on CentOS, should watch out for the system's bundled Python 2.7.5: install Python 3 alongside it, and take care that tools like yum, which depend on the system Python, keep working.
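To confirm which interpreter is actually running your code (easy to get wrong on a CentOS box where plain `python` is still 2.7.5), a trivial check like this helps; it is my own addition, not part of the original script:

import sys

# Should print a 3.x version; if it shows 2.7.5, rerun with `python3`.
print(sys.version)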
2020.03.12: rewritten.
#!/usr/bin/env python3
#coding:utf-8
# pip dependencies: requests bs4 html5lib
import os
import re
import urllib.parse
import shutil
import requests
from bs4 import BeautifulSoup

# Local proxy and a desktop browser User-Agent; adjust or remove to taste.
__PROXIES = {'http': 'http://127.0.0.1:10086', 'https': 'http://127.0.0.1:10086'}
__USER_AGENT = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                "(KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36 Edg/80.0.361.66")
def download(url: str, prefix: str, client: requests.Session = requests.Session()):
    # Create the per-category directory on first use.
    if not os.path.isdir(prefix):
        os.makedirs(prefix)
    filename = urllib.parse.unquote(url.split("/")[-1])
    print("Downloading:", filename)
    # Stream the file to disk in 8 KiB chunks rather than buffering it whole.
    with client.get(url, stream=True) as resp:
        resp.raise_for_status()
        with open(prefix + filename, "wb") as fwriter:
            for chunk in resp.iter_content(chunk_size=8192):
                if chunk:  # filter out keep-alive chunks
                    fwriter.write(chunk)
def pic_info(url: str, client: requests.Session = requests.Session()) -> None:
    info_page = BeautifulSoup(client.get(url).text, "html5lib")
    # Prefer the pool name as the category; otherwise fall back to the copyright tag.
    pool = info_page.find(name="div", attrs={"class": "status-notice", "id": re.compile("pool")})
    if pool:
        try:
            category = pool.find(name="a", attrs={"href": re.compile("/pool/show")}).string
        except (TypeError, AttributeError):
            category = "Default Category"
    else:
        try:
            crinfo = info_page.find(name="li", attrs={"class": "tag-type-copyright"})
            category = crinfo.find_all(name="a")[-1].string
        except (TypeError, AttributeError):
            category = "Default Category"
    # The full-size link carries one of two classes; try both,
    # and skip the post if neither is present.
    original_file = info_page.find(name="a", attrs={"class": "original-file-unchanged"})
    if not original_file:
        original_file = info_page.find(name="a", attrs={"class": "original-file-changed"})
    if not original_file:
        return
    download(original_file["href"], "cache/" + category + "/", client)
def find_in_page(tag: str, page: int, client: requests.Session = requests.Session()) -> None:
    url = "https://yande.re/post?page=" + str(page) + "&tags=" + tag
    psoup = BeautifulSoup(client.get(url).text, "html5lib")
    posts = psoup.find(name="ul", attrs={"id": "post-list-posts"})
    if posts is None:  # no post list on the page (e.g. past the last page)
        return
    # Each thumbnail links to the post page that holds the full-size file.
    for post in posts.find_all(name="a", attrs={"class": "thumb"}):
        pic_info("https://yande.re" + post["href"], client)
if __name__ == "__main__":
    # Start every run from a clean cache directory.
    shutil.rmtree("cache", ignore_errors=True)
    os.mkdir("cache")
    CLIENT = requests.Session()
    CLIENT.headers.update({'User-Agent': __USER_AGENT})
    CLIENT.proxies = __PROXIES
    for n in range(1, 14):
        # Retry the same listing page until it succeeds; a network error
        # only aborts the current attempt.
        flag = True
        while flag:
            try:
                find_in_page("anmi", n, CLIENT)
                flag = False
            except requests.exceptions.RequestException as reqerr:
                print(reqerr)
    # find_in_page("anmi", 6, CLIENT)
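A quick note on reuse (my own gloss, not from the original author): the tag, page range, and proxy are all hard-coded in the main block. For a different artist, or without a local proxy listening on port 10086, you would adjust them roughly like this; the tag and page count below are made up for illustration:

CLIENT = requests.Session()
CLIENT.headers.update({'User-Agent': __USER_AGENT})
CLIENT.proxies = {}  # connect directly instead of via 127.0.0.1:10086

for n in range(1, 8):  # hypothetical tag whose listing spans seven pages
    find_in_page("some_artist", n, CLIENT)

The page count for a tag is shown at the bottom of its yande.re listing.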