某一天突然想要整理一份Anmi的作品集,于是打开了yande.re,看着整整七页的图,我皱起了眉头;但是这并没有难倒我,我成功使用百度找到了一个可以通过tag来下载Y站图片的py程序……这是几天前的事情了,所以我现在一时半会找不到原文的地址了……作者非常神秘,直接贴了这段python代码,但是这代码根本过不了编译(话说python那个能叫编译么?还是叫解释器检查?)……然后我自己对着错误信息改了改才让程序能跑起来……
顺带一提,这个程序需要 Python 3 才能运行。服务器用户(尤其是使用 CentOS 的)请注意:系统自带的 Python 版本是 2.7.5,需要另行安装 Python 3,并处理好 yum 等依赖系统自带 Python 的程序的运行问题;
2020.03.12 已重写
#!/usr/bin/env python3
#coding:utf-8
# Requires: requests, beautifulsoup4 (bs4), html5lib
import os
import re
import urllib
import shutil
import requests
from bs4 import BeautifulSoup
# Proxy mapping for HTTP and HTTPS requests; the script entry point assigns it
# to the session's ``proxies``. Both schemes point at 127.0.0.1:10086 --
# presumably a locally running proxy; adjust for your environment.
__PROXIES = {'http': 'http://127.0.0.1:10086', 'https': 'http://127.0.0.1:10086'}
# User-Agent header value that the script entry point installs on the session.
__USER_AGENT = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36 Edg/80.0.361.66")
def download(url: str, prefix: str, client: "requests.Session | None" = None) -> None:
    """Stream the file at *url* into the directory *prefix*.

    The local file name is the URL-decoded last path segment of *url*.

    Args:
        url: Direct URL of the file to fetch.
        prefix: Target directory; created together with any missing parents.
            A trailing slash is optional.
        client: Session used for the request. When omitted, a temporary
            session is created for this call and closed afterwards.

    Raises:
        requests.exceptions.RequestException: On a connection failure or a
            non-success HTTP status (via ``raise_for_status``).
        OSError: If the directory or the file cannot be created or written.
    """
    # Local import: the file-level ``import urllib`` alone does not guarantee
    # that the ``urllib.parse`` submodule has been loaded.
    from urllib.parse import unquote

    if client is None:
        # A ``requests.Session()`` default would be built once at definition
        # time and silently shared by every call, so build one per call.
        with requests.Session() as session:
            download(url, prefix, session)
        return

    os.makedirs(prefix, exist_ok=True)
    # basename() keeps an encoded "/" (%2F) from escaping *prefix*.
    filename = os.path.basename(unquote(url.split("/")[-1]))
    print("Downloading:", filename)
    with client.get(url, stream=True) as resp:
        resp.raise_for_status()
        with open(os.path.join(prefix, filename), "wb") as fwriter:
            for chunk in resp.iter_content(chunk_size=8192):
                if chunk:  # filter out keep-alive new chunks
                    fwriter.write(chunk)
def pic_info(url: str, client: "requests.Session | None" = None) -> None:
    """Fetch one post page, choose a category and download its original file.

    The category is the text of the pool link when the page has a pool
    notice, otherwise the text of the last copyright tag link. If neither
    yields a usable name, "Default Category" is used. The file is saved
    under ``cache/<category>/``.

    Args:
        url: URL of the post page.
        client: Session used for all requests. When omitted, a temporary
            session is created for this call and closed afterwards.
    """
    if client is None:
        # A ``requests.Session()`` default would be built once at definition
        # time and silently shared by every call, so build one per call.
        with requests.Session() as session:
            pic_info(url, session)
        return

    info_page = BeautifulSoup(client.get(url).text, "html5lib")

    pool = info_page.find(name="div", attrs={"class": "status-notice", "id": re.compile("pool")})
    if pool:
        link = pool.find(name="a", attrs={"href": re.compile("/pool/show")})
    else:
        crinfo = info_page.find(name="li", attrs={"class": "tag-type-copyright"})
        links = crinfo.find_all(name="a") if crinfo is not None else []
        link = links[-1] if links else None

    # ``.string`` is None when the link does not wrap exactly one text node.
    category = link.string if link is not None else None
    if category:
        # The category becomes a directory name: replace path separators and
        # characters that are invalid in file names (e.g. "Fate/Grand Order").
        category = re.sub(r'[\\/:*?"<>|]', "_", str(category)).strip(" .")
    if not category:
        category = "Default Category"

    original_file = info_page.find(name="a", attrs={"class": "original-file-unchanged"})
    if original_file is None:
        original_file = info_page.find(name="a", attrs={"class": "original-file-changed"})
    if original_file is None or not original_file.get("href"):
        # Skip this post instead of aborting the whole crawl.
        print("No original file link found:", url)
        return

    download(original_file["href"], "cache/" + category + "/", client)
def find_in_page(tag: str, page: int, client: "requests.Session | None" = None) -> None:
    """Process every post thumbnail on one listing page for *tag*.

    Each thumbnail link found inside the ``post-list-posts`` list is passed
    to ``pic_info``. A page without that list is reported and skipped.

    Args:
        tag: Tag query, inserted into the URL as given.
        page: Listing page number.
        client: Session used for all requests. When omitted, a temporary
            session is created for this call and closed afterwards.
    """
    if client is None:
        # A ``requests.Session()`` default would be built once at definition
        # time and silently shared by every call, so build one per call.
        with requests.Session() as session:
            find_in_page(tag, page, session)
        return

    url = "https://yande.re/post?page=" + str(page) + "&tags=" + tag
    psoup = BeautifulSoup(client.get(url).text, "html5lib")
    posts = psoup.find(name='ul', attrs={"id": "post-list-posts"})
    if posts is None:
        print("No post list found on page", page, "for tag", tag)
        return

    # find_all() visits only tags, so whitespace text nodes between the list
    # items need no special handling.
    for post in posts.find_all(name="a", attrs={"class": "thumb"}):
        href = post.get("href")
        if href:
            pic_info("https://yande.re" + href, client)
if __name__ == "__main__":
    import time  # needed only for the retry delay below

    TAG = "anmi"
    LAST_PAGE = 13       # listing pages 1..LAST_PAGE are crawled
    MAX_ATTEMPTS = 5     # tries per page before giving up on it
    RETRY_DELAY = 3      # seconds to wait between tries

    # Start from an empty cache directory. exist_ok covers the case where
    # rmtree (which ignores errors) could not remove the old one.
    shutil.rmtree('cache', ignore_errors=True)
    os.makedirs("cache", exist_ok=True)

    with requests.Session() as CLIENT:
        CLIENT.headers.update({'User-Agent': __USER_AGENT})
        CLIENT.proxies = __PROXIES
        for n in range(1, LAST_PAGE + 1):
            # Retry a page on network errors, but not forever: a permanent
            # failure would otherwise loop endlessly against the server.
            for attempt in range(1, MAX_ATTEMPTS + 1):
                try:
                    find_in_page(TAG, n, CLIENT)
                except requests.exceptions.RequestException as reqerr:
                    print(reqerr)
                    if attempt < MAX_ATTEMPTS:
                        time.sleep(RETRY_DELAY)
                else:
                    break
            else:
                print("Giving up on page", n, "after", MAX_ATTEMPTS, "attempts")
One comment
我们不生产代码,只是代码的搬运工