'''
Scrape and download images.

Fetch pages with requests, parse them with BeautifulSoup.

Listing pages:
    https://pic.netbian.com/4kmeinv/
    https://pic.netbian.com/4kmeinv/index_2.html
Example image URL:
    https://pic.netbian.com/uploads/allimg/220809/101035-16600110352f43.jpg
'''
import os

import requests
from bs4 import BeautifulSoup


# Fetch the page and return its HTML source
def craw_html(url):
    resp = requests.get(url)
    resp.encoding = 'gbk'  # the site is GBK-encoded; requests guesses ISO-8859-1
    print(resp.status_code)  # 200 -- the page has no anti-scraping measures
    html = resp.text
    # print(html)
    return html


# Parse the image URLs out of the HTML and download each image
def parse_and_download(html):
    soup = BeautifulSoup(html, 'html.parser')
    imgs = soup.find_all('img')
    for img in imgs:
        src = img.get('src')  # equivalently: img['src']
        if not src or "/uploads/" not in src:
            continue
        # The src attribute is a path without a domain, so prepend it
        src = f"https://pic.netbian.com{src}"
        # Use the last URL segment as the local file name
        filename = os.path.basename(src)
        # Files/images downloaded from the web are written in binary ('wb') mode
        with open(f"./美女图片/{filename}", 'wb') as f:
            resp_img = requests.get(src)
            print(src)
            f.write(resp_img.content)


if __name__ == '__main__':
    os.makedirs('./美女图片', exist_ok=True)  # ensure the output folder exists
    # Page 1 has no suffix; pages 2-10 follow the index_{i}.html pattern,
    # joined onto the list by the + concatenation operator
    urls = ['https://pic.netbian.com/4kmeinv/'] + [
        f'https://pic.netbian.com/4kmeinv/index_{i}.html' for i in range(2, 11)
    ]
    for url in urls:
        print("#### Crawling:", url)
        html = craw_html(url)
        parse_and_download(html)
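

# A hedged sketch, not part of the original flow: the helper name
# download_image, the status check, and the 0.5 s delay are assumptions
# of mine, not taken from the script above. It shows one way to harden
# the inline download step: raise_for_status() fails loudly on 4xx/5xx
# instead of silently writing an error page to disk, and a short pause
# between requests avoids hammering the server.
import time


def download_image(src, out_dir='./美女图片', delay=0.5):
    resp_img = requests.get(src)
    resp_img.raise_for_status()  # raise on 4xx/5xx responses
    os.makedirs(out_dir, exist_ok=True)  # create the folder on first use
    with open(os.path.join(out_dir, os.path.basename(src)), 'wb') as f:
        f.write(resp_img.content)
    time.sleep(delay)  # brief pause between image downloads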