Anyone who has written a crawler knows you have to disguise your requests (request headers and so on) and also hide your own IP, otherwise your IP gets banned quickly and you can no longer reach the site. There are plenty of free proxy IPs, and paid ones as well; if your budget allows, or you have higher requirements for proxy quality, it is better to simply buy paid proxies, which you can fetch directly through the vendor's API. Here I will mainly cover scraping the free IPs from kuaidaili, verifying that each one can actually reach the site you want to visit, and saving the working ones to Redis or a file.
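To make the "hide your IP" part concrete: requests routes traffic through a proxy via its proxies dict. Below is a minimal sketch of using a purchased proxy fetched from a vendor API; the endpoint api.example-proxy.com/get_ip is purely hypothetical, so substitute your provider's real interface.

import requests

# Hypothetical paid-proxy API endpoint -- replace with your vendor's real one
PROXY_API = "http://api.example-proxy.com/get_ip"

def fetch_paid_proxy():
    '''Ask the (hypothetical) vendor API for a single "ip:port" string.'''
    return requests.get(PROXY_API, timeout=5).text.strip()

proxy = fetch_paid_proxy()
resp = requests.get("https://httpbin.org/ip",
                    proxies={"http": "http://" + proxy, "https": "http://" + proxy},
                    timeout=5)
print(resp.text)  # should report the proxy's IP, not your own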
Open the kuaidaili site; the free proxy list lives at https://www.kuaidaili.com/free/inha/1/
Alright, here is the scraping code:
import requests
from lxml import etree
from fake_useragent import UserAgent
import redis
import threading
import time
pool = redis.ConnectionPool(host='127.0.0.1', port=6379, db=1, password='')
# URL used to verify that a proxy actually works
test_url = "https://www.1688.com/"
def get_proxy(url):
    '''Scrape free proxy IPs from one page of kuaidaili'''
    headers = {'User-Agent': str(UserAgent(verify_ssl=False).random)}
    response = requests.get(url, headers=headers)
    selector = etree.HTML(response.text)
    proxies = []
    # Each table row holds one proxy; column 1 is the IP, column 2 the port
    for each in selector.xpath('//table[@class="table table-bordered table-striped"]/tbody/tr')[1:]:
        ip = each.xpath("./td[1]/text()")[0]
        port = each.xpath("./td[2]/text()")[0]
        proxy = ip + ":" + port
        proxies.append(proxy)
    test_proxies(proxies, test_url, 'ip_proxy')
def test_proxies(proxies, test_url, ipname):
    '''Check which of the scraped proxies can actually reach test_url'''
    normal_proxies = []
    count = 1
    for proxy in proxies:
        print("Proxy #{}: {}".format(count, proxy))
        count += 1
        try:
            headers = {'User-Agent': str(UserAgent(verify_ssl=False).random)}
            # Register the proxy for both schemes; test_url is https, so an
            # "http"-only entry would silently bypass the proxy
            response = requests.get(test_url, headers=headers,
                                    proxies={"http": "http://" + proxy,
                                             "https": "http://" + proxy},
                                    timeout=1)
            if response.status_code == 200:
                print("{} works".format(proxy))
                normal_proxies.append(proxy)
            else:
                print("{} -- {} -- not usable".format(proxy, response.status_code))
        except Exception:
            print("{} is invalid".format(proxy))
    write_proxy(normal_proxies, ipname)
def write_proxy(proxies, name):
    '''Save the usable proxies to Redis and to a text file'''
    r = redis.Redis(connection_pool=pool)
    for proxy in proxies:
        # Push onto a Redis list ...
        r.lpush(name, proxy)
        # ... and append to a local file as a backup
        with open("./{}.txt".format(name), 'a+') as f:
            f.write(proxy + '\n')
    print(r.lrange(name, 0, -1))
    print("Done saving!")
if __name__ == "__main__":
    base_url = "https://www.kuaidaili.com/free/inha/{}/"
    for i in range(1, 3000):
        print("Page {}".format(i))
        url = base_url.format(i)
        get_proxy(url)
        # Brief pause between pages so kuaidaili does not block us for crawling too fast
        time.sleep(1)
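Once the list has been populated, a consumer can pop a proxy back out of Redis and route requests through it. A minimal sketch, assuming the same Redis connection settings and the ip_proxy list name used above:

import redis
import requests

pool = redis.ConnectionPool(host='127.0.0.1', port=6379, db=1, password='')
r = redis.Redis(connection_pool=pool)

# Pop one saved proxy off the ip_proxy list (redis-py returns bytes by default)
proxy = r.rpop('ip_proxy')
if proxy:
    proxy = proxy.decode('utf-8')
    resp = requests.get("https://www.1688.com/",
                        proxies={"http": "http://" + proxy, "https": "http://" + proxy},
                        timeout=5)
    print(proxy, resp.status_code)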