零组文库 (Zero Group Library): grab a copy while you still can

零组文库 has already shut down, but I found a backup site online, 主页 · 资料文库 ("Home · Document Library"):

http://book.iwonder.run/index.html

There are far too many pages to save by hand, so I wrote a script. But hammering the site with that many back-to-back requests got it to stop responding to me entirely (a throttling fix is sketched after the script). The script is as follows:

#!C:\Python3.7
# -*- coding:utf-8 -*-

import requests
from lxml import etree
import os


def get_date(url):
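    """Mirror the whole archive by walking the site's left-hand nav tree.

    The nav nests up to three levels of li/ul entries; each level becomes a
    folder under ./osec, and every leaf link is fetched, saved as .html, and
    scanned for images to download alongside it.
    """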
    try:
        req = requests.get(url)
        req.encoding = "UTF-8"
        # print(req.text)
        html = etree.HTML(req.text)
        # 1 /html/body/div/div[1]/nav/ul/li[2]/span/text()
        #1_level directory
        directory_html = html.xpath("/html/body/div/div[1]/nav/ul/li")
        for directory in directory_html:
            directory_name = directory.xpath("./span/text()")
            if len(directory_name)>0:
                directory_name=directory_name[0].replace("\n","").strip()[1:-1]
                # print(directory_name)
                mkdir(directory_name)#创建一级文件夹

                ##2_level directory
                sub_html = directory.xpath("./ul/li")
                for sub in sub_html:
                    # 2 /html/body/div/div[1]/nav/ul/li[2]/ul/li/span/text()
                    # 3/html/body/div/div[1]/nav/ul/li[6]/ul/li/ul/li/span/text()
                    sub_name = sub.xpath("./span/text()")
                    if len(sub_name) >0:
                        sub_name=sub_name[0].replace("\n","").strip()
                        # print(sub_name)
                        mkdir(os.path.join(directory_name,sub_name))#创建二级文件夹
                        mkdir(os.path.join(directory_name,sub_name,'img'))#创建图片文件夹
                        # href /html/body/div/div[1]/nav/ul/li[2]/ul/li[1]/ul/li/a/@href
                        #3_page_url
                        href=sub.xpath("./ul/li/a/@href")
                        # print(href)
                    # exit()

                        if len(href)>0:
                            for u in href:
                                if "http" in u:
                                    continue
                                page_url=url+u
                                print("2",page_url)
                                data,file_name = page(page_url)#页面处理
                                # print(file_name)
                                file_name=file_name+".html"
                                save_file(os.path.join(directory_name,sub_name,file_name),data.encode())
                                #save_img
                                img_list = get_img_url(page_url,sub_name)
                                if len(img_list)>0:
                                    for img_url in img_list:
                                        save_img(os.path.join(directory_name,sub_name),img_url)

                        else:
                            # #3_level directory
                            print("3")
                            third_html= sub.xpath("./ul/li")
                            for third in third_html:
                                third_name = third.xpath("./span/text()")
                                # print("third:", third_name)

                                if len(third_name) > 0:
                                    third_name = third_name[0].replace("\n", "").strip()

                                    mkdir(os.path.join(directory_name,sub_name,third_name))#创建三级目录
                                    mkdir(os.path.join(directory_name, sub_name,third_name, 'img'))  # 创建图片文件夹
                                    href = third.xpath("./ul/li/a/@href")
                                    # print(href)

                                    if len(href) > 0:
                                        for u in href:
                                            if "http" in u:
                                                continue
                                            page_url=url+u
                                            print("3",page_url)
                                            data,file_name = page(page_url)#页面处理
                                            # print(file_name)
                                            file_name = file_name + ".html"
                                            save_file(os.path.join(directory_name,sub_name,third_name,file_name),data.encode())
                                            #save_img
                                            img_list = get_img_url(page_url,sub_name)
                                            if len(img_list)>0:
                                                for img_url in img_list:
                                                    save_img(os.path.join(directory_name,sub_name,third_name),img_url)
        print("[****] finish!")
    except Exception as e:
        print(e)
        pass



def mkdir(name):
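    """Create ./osec/<name> if it does not exist yet."""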
    dir = os.getcwd()
    dir=os.path.join(dir,"osec",name)
    # print(dir)
    if not os.path.exists(dir):
        os.makedirs(dir)  # makedirs (not mkdir), so the osec/ base and nested levels get created too
        print("[D*]mkdir ",dir,"success!")
    else:
        print("[D-]", dir, "exists!")


def save_file(name,data):
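    """Write data (bytes) to ./osec/<name>, skipping files that already exist."""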
    dir = os.getcwd()
    dir = os.path.join(dir, "osec", name)
    if not os.path.exists(dir):
        with open(dir,"wb") as file:
            file.write(data)
            print("[f*] write ",dir," success!")
    else:
        print("[f-] write ", dir, " exists!")


def page(url):
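    """Fetch one article page; return (cleaned HTML, sanitized filename)."""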
    try:
        req = requests.get(url)
        req.encoding = 'UTF-8'
        # print(req.text)
        # The marker strings here were lost when this post was scraped (the
        # HTML inside the quotes was eaten). The intent is clearly to cut one
        # unwanted block out of the page before saving; stripping the first
        # <script> block is an assumption that matches that intent.
        start = req.text.find("<script")
        stop = req.text.find("</script>")
        # print(start,stop)
        if start != -1 and stop != -1:
            rem = req.text[start:stop + len("</script>")]
            # print(rem)
            data = req.text.replace(rem,"")
        else:
            data = req.text
        # print(data)

        #filename
        html = etree.HTML(req.text)
        filename = html.xpath("/html/body/div/div[2]/div/div[2]/div/div/div[1]/section/h1/text()")
        if len(filename)>0:
            filename=filename[0].strip()
        else:
            filename=url.rsplit("/",1)[-1].rsplit(".",1)[0]
        filename=str(filename).replace("<","").replace("=","").replace("/","_").replace("\\","_")
        # print(data,filename)
        return data,filename
    except Exception as e:
        print(e)
        # return a fallback so the caller's "data,file_name = page(...)"
        # unpack does not blow up on None
        return "", "untitled"


def get_img_url(page_url,sub_name):
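    """Return the page's image URLs, rebased onto the 0day archive path."""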
    base_url = "http://book.iwonder.run/0day/"
    req = requests.get(page_url)
    req.encoding = 'UTF-8'
    html = etree.HTML(req.text)
    img_list = html.xpath('//*[@id="book-search-results"]/div[1]/section/p/img/@src')
    if len(img_list)>0:
        for i in range(len(img_list)):
            img_list[i] = base_url+sub_name+"/"+img_list[i]
    # print(img_list)
    return img_list


def save_img(path,img_url):
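    """Download one image into ./osec/<path>/img/, skipping existing files."""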

    base_path = os.getcwd()
    base_path = os.path.join(base_path,"osec",path,"img")
    filename = str(img_url).rsplit("/",1)[-1]
    file_path = os.path.join(base_path,filename)
    # print(file_path)
    if not os.path.exists(file_path):
        try:
            file_data = requests.get(img_url).content
            with open(file_path,"wb") as file:
                file.write(file_data)
                print("[img**] save img ",file_path," success!")
        except:
            pass
    else:
        print("[img--] save img ", file_path, " exists!")




if __name__ == '__main__':

    url="http://book.iwonder.run/"
    page_url="http://book.iwonder.run/0day/74cms/%E6%96%B0%E7%89%8874cms%20v4.2.1-v4.2.129-%E5%90%8E%E5%8F%B0getshell%E6%BC%8F%E6%B4%9E.html"

    get_date(url)
    # mkdir("测试")
    # page("http://book.iwonder.run/0day/Coremail/1.html")
    # img(page_url,"test")
    # save_img(os.path.join("dirctory","subname"),)
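The crawl dying mid-run is almost certainly rate limiting: the script fires every request back-to-back, with no delay, no User-Agent, and no retry on transient errors. Below is a minimal sketch of a politer fetch helper; polite_get is a hypothetical name of my own, and the one-second delay, the header text, and the retry numbers are guesses rather than anything the site documents, so tune them to taste.

import time

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# One shared session: reuses TCP connections and sends a User-Agent,
# which already looks less like a flood of anonymous one-off requests.
session = requests.Session()
session.headers["User-Agent"] = "Mozilla/5.0 (low-volume archival script)"

# Retry transient failures (429/5xx, connection resets) with growing backoff.
retry = Retry(total=3, backoff_factor=2, status_forcelist=[429, 500, 502, 503])
session.mount("http://", HTTPAdapter(max_retries=retry))
session.mount("https://", HTTPAdapter(max_retries=retry))


def polite_get(url, delay=1.0):
    """Hypothetical helper, not part of the original script: GET through
    the shared session, then sleep so the next request is at least
    `delay` seconds away (1.0 is a guess; tune it)."""
    resp = session.get(url, timeout=15)
    time.sleep(delay)
    return resp

Swapping every requests.get(url) in the script for polite_get(url) keeps the crawl to roughly one request per second, which is usually slow enough not to trip a small site's defenses.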
