零组文库 (the Zero Group document library) has been shut down. I found a backup site online, 主页 · 资料文库:
http://book.iwonder.run/index.html
There were far too many pages to save by hand, so I wrote a script to crawl them. But firing off that many requests in a row got the crawler blocked, and the site stopped responding. The script is below; a throttling sketch follows it.
#!C:\Python3.7
# -*- coding:utf-8 -*-
import os

import requests
from lxml import etree


def get_date(url):
    try:
        req = requests.get(url)
        req.encoding = "UTF-8"
        html = etree.HTML(req.text)
        # 1st-level directory: /html/body/div/div[1]/nav/ul/li[2]/span/text()
        directory_html = html.xpath("/html/body/div/div[1]/nav/ul/li")
        for directory in directory_html:
            directory_name = directory.xpath("./span/text()")
            if len(directory_name) > 0:
                directory_name = directory_name[0].replace("\n", "").strip()[1:-1]
                mkdir(directory_name)  # create the first-level folder
                # 2nd-level directory
                sub_html = directory.xpath("./ul/li")
                for sub in sub_html:
                    # 2nd level: /html/body/div/div[1]/nav/ul/li[2]/ul/li/span/text()
                    # 3rd level: /html/body/div/div[1]/nav/ul/li[6]/ul/li/ul/li/span/text()
                    sub_name = sub.xpath("./span/text()")
                    if len(sub_name) > 0:
                        sub_name = sub_name[0].replace("\n", "").strip()
                        mkdir(os.path.join(directory_name, sub_name))         # second-level folder
                        mkdir(os.path.join(directory_name, sub_name, "img"))  # image folder
                        # page links: /html/body/div/div[1]/nav/ul/li[2]/ul/li[1]/ul/li/a/@href
                        href = sub.xpath("./ul/li/a/@href")
                        if len(href) > 0:
                            for u in href:
                                if "http" in u:  # skip external links
                                    continue
                                page_url = url + u
                                print("2", page_url)
                                data, file_name = page(page_url)  # process the page
                                file_name = file_name + ".html"
                                save_file(os.path.join(directory_name, sub_name, file_name), data.encode())
                                # save the page's images
                                img_list = get_img_url(page_url, sub_name)
                                if len(img_list) > 0:
                                    for img_url in img_list:
                                        save_img(os.path.join(directory_name, sub_name), img_url)
                        else:
                            # 3rd-level directory
                            print("3")
                            third_html = sub.xpath("./ul/li")
                            for third in third_html:
                                third_name = third.xpath("./span/text()")[0].replace("\n", "").strip()
                                if len(third_name) > 0:
                                    mkdir(os.path.join(directory_name, sub_name, third_name))         # third-level folder
                                    mkdir(os.path.join(directory_name, sub_name, third_name, "img"))  # image folder
                                    href = third.xpath("./ul/li/a/@href")
                                    if len(href) > 0:
                                        for u in href:
                                            if "http" in u:
                                                continue
                                            page_url = url + u
                                            print("3", page_url)
                                            data, file_name = page(page_url)  # process the page
                                            file_name = file_name + ".html"
                                            save_file(os.path.join(directory_name, sub_name, third_name, file_name), data.encode())
                                            # save the page's images
                                            img_list = get_img_url(page_url, sub_name)
                                            if len(img_list) > 0:
                                                for img_url in img_list:
                                                    save_img(os.path.join(directory_name, sub_name, third_name), img_url)
        print("[****] finish!")
    except Exception as e:
        print(e)


def mkdir(name):
    dir = os.path.join(os.getcwd(), "osec", name)
    if not os.path.exists(dir):
        os.makedirs(dir)  # makedirs also creates the top-level "osec" folder on first use
        print("[D*] mkdir", dir, "success!")
    else:
        print("[D-]", dir, "exists!")


def save_file(name, data):
    dir = os.path.join(os.getcwd(), "osec", name)
    if not os.path.exists(dir):
        with open(dir, "wb") as file:
            file.write(data)
        print("[f*] write", dir, "success!")
    else:
        print("[f-] write", dir, "exists!")


def page(url):
    try:
        req = requests.get(url)
        req.encoding = "UTF-8"
        # Strip a block from the saved page before writing it to disk. The
        # original search strings were HTML tags that were eaten when this post
        # was rendered; '<script' / '</script>' is an assumption about what the
        # original code removed.
        start = req.text.find('<script')
        stop = req.text.find('</script>') + len('</script>')
        rem = req.text[start:stop]
        data = req.text.replace(rem, "")
        # filename: use the page's <h1> title, falling back to the URL
        html = etree.HTML(req.text)
        filename = html.xpath("/html/body/div/div[2]/div/div[2]/div/div/div[1]/section/h1/text()")
        if len(filename) > 0:
            filename = filename[0].strip()
        else:
            filename = url.rsplit("/", 1)[-1].rsplit(".", 1)[0]
        filename = str(filename).replace("<", "").replace("=", "").replace("/", "_").replace("\\", "_")
        return data, filename
    except Exception as e:
        print(e)


def get_img_url(page_url, sub_name):
    base_url = "http://book.iwonder.run/0day/"
    req = requests.get(page_url)
    req.encoding = "UTF-8"
    html = etree.HTML(req.text)
    img_list = html.xpath("//*[@id='book-search-results']/div[1]/section/p/img/@src")
    if len(img_list) > 0:
        for i in range(len(img_list)):
            img_list[i] = base_url + sub_name + "/" + img_list[i]
    return img_list


def save_img(path, img_url):
    base_path = os.path.join(os.getcwd(), "osec", path, "img")
    filename = str(img_url).rsplit("/", 1)[-1]
    file_path = os.path.join(base_path, filename)
    if not os.path.exists(file_path):
        try:
            file_data = requests.get(img_url).content
            with open(file_path, "wb") as file:
                file.write(file_data)
            print("[img**] save img", file_path, "success!")
        except Exception as e:
            print(e)
    else:
        print("[img--] save img", file_path, "exists!")


if __name__ == '__main__':
    url = "http://book.iwonder.run/"
    page_url = "http://book.iwonder.run/0day/74cms/%E6%96%B0%E7%89%8874cms%20v4.2.1-v4.2.129-%E5%90%8E%E5%8F%B0getshell%E6%BC%8F%E6%B4%9E.html"
    get_date(url)
    # mkdir("测试")
    # page("http://book.iwonder.run/0day/Coremail/1.html")
    # get_img_url(page_url, "test")