
import re
import os
import requests
# Running index that save_pic uses to name output files (0.jpg, 1.jpg, ...).
# A `global` statement is only meaningful inside a function, so none is
# needed here at module scope.
i = 0
def get_one_page(url, timeout=10):
    """Fetch *url* with an HTTP GET and return its body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. It is
            passed straight to ``requests.get``; without it a stalled
            connection would block the crawl indefinitely.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            server does not answer within *timeout* seconds.
    """
    response = requests.get(url, timeout=timeout)
    # Decode as UTF-8 regardless of what the response headers advertise.
    response.encoding = 'utf-8'
    return response.text
def get_urls(html):
    """Extract detail-page links from *html*.

    Args:
        html: Markup of a listing page.

    Returns:
        A list of site-relative paths of the form
        ``/touxiang/qinglv/20<digits>/<digits>.html``, in document order.
        Duplicates are kept. The list is empty when nothing matches.
    """
    # Raw string: "\d" and "\." are regex escapes, not Python string escapes.
    pattern = re.compile(r'href="(/touxiang/qinglv/20\d+/\d+\.html)"')
    return pattern.findall(html)
def get_pic_url(html):
    """Extract protocol-relative ``.jpeg`` image links from *html*.

    Args:
        html: Markup of a detail page.

    Returns:
        A list of strings of the form
        ``//img<digit>.woyaogexing.com/20<2 digits>....jpeg``, in document
        order. The list is empty when nothing matches.
    """
    # [^"]*? keeps the match inside a single href attribute. A dot-all ".*?"
    # could start at a non-.jpeg link and swallow markup up to the next
    # ".jpeg" that appears later in the page.
    pattern = re.compile(
        r'href="(//img\d\.woyaogexing\.com/20\d\d[^"]*?\.jpeg)"'
    )
    return pattern.findall(html)
def save_pic(url, pic_path):
    """Download the image at *url* and store it in directory *pic_path*.

    The file is named ``<i>.jpg``, where ``i`` is the module-level counter.
    The counter is incremented after each successful save.

    Args:
        url: Absolute URL of the image to download.
        pic_path: Directory to write into; created (including missing
            parents) when it does not exist yet.

    Raises:
        requests.exceptions.RequestException: If the download fails.
        OSError: If the directory or file cannot be created.
    """
    global i
    # exist_ok avoids the check-then-create race and handles nested paths.
    os.makedirs(pic_path, exist_ok=True)
    # Fetch before opening the file so a failed request leaves no empty file.
    content = requests.get(url, timeout=10).content
    with open(os.path.join(pic_path, str(i) + '.jpg'), 'wb') as f:
        f.write(content)
    print(url)
    i += 1
def main():
    """Walk the listing page and pass every picture it leads to on to save_pic.

    The listing page is fetched first. Each detail link found on it is then
    fetched in turn, and every image link on a detail page is handed to
    save_pic with an ``http:`` scheme prepended.
    """
    listing_html = get_one_page('https://www.woyaogexing.com/touxiang/qinglv/new/')
    for detail_path in get_urls(listing_html):
        # Links on the listing page are site-relative, so prepend the host.
        detail_html = get_one_page('https://www.woyaogexing.com' + detail_path)
        for picture in get_pic_url(detail_html):
            # Image links are protocol-relative ("//host/..."); add a scheme.
            save_pic('http:' + picture, 'D:\\test\\')
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
感谢博主
哈哈哈
测试
我来啦