This is a Baidu image crawler built on Python 3. It improves the crawler's fault tolerance and stability, and crawls Baidu Images in a single thread.
Full code
```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import re
import json
import socket
import time
import urllib.request
import urllib.parse
import urllib.error

# Global socket timeout in seconds
timeout = 5
socket.setdefaulttimeout(timeout)


class Crawler:
    __amount = 0        # offset just past the last page to fetch
    __start_amount = 0  # offset of the first page to fetch
    __counter = 0       # running number used to name saved files

    # t: pause between requests, in seconds
    def __init__(self, t=0.1):
        self.time_sleep = t

    # Fetch result pages and hand each JSON payload to __saveImage
    def __getImages(self, word='美女'):
        search = urllib.parse.quote(word)
        # pn: offset of the first image on the current result page
        pn = self.__start_amount
        while pn < self.__amount:
            # Browser-like User-Agent to avoid getting banned
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) '
                                     'Gecko/20100101 Firefox/23.0'}
            url = ('http://image.baidu.com/search/avatarjson?tn=resultjsonavatarnew'
                   '&ie=utf-8&word=' + search + '&cg=girl&pn=' + str(pn) +
                   '&rn=60&itg=0&z=0&fr=&width=&height=&lm=-1&ic=0&s=0&st=-1'
                   '&gsm=1e0000001e')
            page = None
            try:
                time.sleep(self.time_sleep)
                req = urllib.request.Request(url=url, headers=headers)
                page = urllib.request.urlopen(req)
                data = page.read().decode('utf8')
            except UnicodeDecodeError:
                print('-----UnicodeDecodeError url:', url)
            except urllib.error.URLError:
                print('-----URLError url:', url)
            except socket.timeout:
                print('-----socket timeout:', url)
            else:
                json_data = json.loads(data)
                self.__saveImage(json_data, word)
            finally:
                if page is not None:
                    page.close()
            # Advance even after an error so a persistent failure cannot loop forever
            print('Fetching next page')
            pn += 60
        print('Download finished')
        return

    # Save every image referenced by one JSON result page
    def __saveImage(self, rsp_data, word):
        if not os.path.exists('./' + word):
            os.mkdir('./' + word)
        # Continue numbering after any images already in the folder
        self.__counter = len(os.listdir('./' + word)) + 1
        for info in rsp_data['imgs']:
            try:
                self.__downloadImage(info, word)
            except urllib.error.HTTPError as urllib_err:
                print(urllib_err)
                continue
            except Exception as err:
                time.sleep(1)
                print(err)
                print('Unknown error, skipping this image')
                continue
            else:
                print('Image +1, ' + str(self.__counter) + ' images saved')
                self.__counter += 1
        return

    # Download a single image
    def __downloadImage(self, info, word):
        time.sleep(self.time_sleep)
        fix = self.__getFix(info['objURL'])
        urllib.request.urlretrieve(info['objURL'],
                                   './' + word + '/' + str(self.__counter) + fix)

    # Extract the file extension, falling back to .jpeg
    def __getFix(self, name):
        m = re.search(r'\.[^.]*$', name)
        if m and len(m.group(0)) <= 5:
            return m.group(0)
        return '.jpeg'

    # Name without its extension (helper kept from the original, currently unused)
    def __getPrefix(self, name):
        return name[:name.find('.')]

    # word: search keyword
    # spider_page_num: number of result pages to crawl (60 images per page)
    # start_page: first result page to crawl
    def start(self, word, spider_page_num=1, start_page=1):
        self.__start_amount = (start_page - 1) * 60
        self.__amount = spider_page_num * 60 + self.__start_amount
        self.__getImages(word)


crawler = Crawler(0.05)
# crawler.start('美女', 1, 2)
crawler.start('墨镜男', 100)
# crawler.start('帅哥', 5)
```
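In practice many objURL links returned by this endpoint are dead or answer with an HTML error page instead of an image, which silently leaves broken files on disk. As a minimal sketch of how the download step could be hardened further (this helper is not part of the original script; the name `download_with_retry`, the retry count, and the 1 KB size threshold are all assumptions), a retrying wrapper around `urllib.request.urlretrieve` might look like this:

```python
import os
import urllib.request


def download_with_retry(url, path, retries=3):
    """Hypothetical helper: retry a flaky download and drop suspiciously small files."""
    for _ in range(retries):
        try:
            urllib.request.urlretrieve(url, path)
        except OSError:  # URLError and socket.timeout are both OSError subclasses
            continue
        if os.path.getsize(path) > 1024:  # assume anything under 1 KB is an error page
            return True
        os.remove(path)  # discard the junk response before retrying
    return False
```

`__downloadImage` could then call this helper and let `__saveImage` skip any image for which it returns False.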
How to run
- Edit the last line of index.py to change the search keyword
- Images are saved under the project directory by default (in a folder named after the keyword)
- Run the crawler:
```
python3 index.py
```
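For reference, the crawl is configured entirely by the last lines of index.py: `Crawler` takes the pause between requests in seconds, and `start()` takes the keyword, the number of result pages to fetch (60 images per page), and optionally the page to start from:

```python
crawler = Crawler(0.05)        # pause 0.05 s between requests
crawler.start('墨镜男', 100)    # 100 result pages, i.e. up to 6000 images
# crawler.start('美女', 1, 2)   # a single page, starting from page 2
```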
Sample output (screenshot):