Anyone who has written a crawler knows you have to disguise your requests (the request headers included) and hide your own IP, otherwise your IP gets banned quickly and you lose access to the site. There are plenty of free proxy IPs, and paid ones too; if your budget allows, or you need reliable proxies, it's better to buy a paid service and fetch the IPs directly through its API. Here I'll focus on scraping the free IPs from Kuaidaili, verifying that each one can actually reach the site you want to visit, and saving the working ones to Redis and a text file.
Open the Kuaidaili site; the free proxy list lives at https://www.kuaidaili.com/free/inha/1/
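Before crawling every page, it's worth a quick sanity check that the XPath the code below relies on still matches the page's markup. A minimal sketch, assuming the proxy list is still rendered as a table with class "table table-bordered table-striped" (as it was when this was written):

import requests
from lxml import etree

# Fetch page 1 and count the table rows the crawler will parse;
# if Kuaidaili changes its markup, this prints 0 and the XPath needs updating
resp = requests.get(
    "https://www.kuaidaili.com/free/inha/1/",
    headers={"User-Agent": "Mozilla/5.0"},
    timeout=10,
)
rows = etree.HTML(resp.text).xpath(
    '//table[@class="table table-bordered table-striped"]/tbody/tr'
)
print("rows found:", len(rows))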
Alright, here is the scraping code:

import requests
from lxml import etree
from fake_useragent import UserAgent
import redis
import time


# Redis connection pool (local instance, db 1, no password)
pool = redis.ConnectionPool(host='127.0.0.1', port=6379, db=1, password='')

# URL used to check whether a proxy can reach the target site
test_url = "https://www.1688.com/"

def get_proxy(url):
    '''Scrape free proxy IPs from one Kuaidaili list page'''
    headers = {'User-Agent': UserAgent().random}
    response = requests.get(url, headers=headers)
    selector = etree.HTML(response.text)
    proxies = []
    # Each table row holds one proxy: td[1] is the IP, td[2] the port
    for each in selector.xpath('//table[@class="table table-bordered table-striped"]/tbody/tr')[1:]:
        ip = each.xpath("./td[1]/text()")[0]
        port = each.xpath("./td[2]/text()")[0]
        proxies.append(ip + ":" + port)
    test_proxies(proxies, test_url, 'ip_proxy')

def test_proxies(proxies, test_url, ipname):
    '''Check which proxies can actually reach the test URL'''
    normal_proxies = []
    for count, proxy in enumerate(proxies, 1):
        print("No. {}: {}".format(count, proxy))
        try:
            headers = {'User-Agent': UserAgent().random}
            # Route both http and https through the proxy; mapping only the
            # "http" key would leave an https test URL unproxied
            response = requests.get(
                test_url,
                headers=headers,
                proxies={"http": "http://" + proxy, "https": "http://" + proxy},
                timeout=1,  # treat slow proxies as dead
            )
            if response.status_code == 200:
                print("{} works ({:.2f}s)".format(proxy, response.elapsed.total_seconds()))
                normal_proxies.append(proxy)
            else:
                print("{} returned status {}".format(proxy, response.status_code))
        except requests.RequestException:
            print("{} is invalid".format(proxy))
    write_proxy(normal_proxies, ipname)

def write_proxy(proxies, name):
    '''Persist working proxies to Redis and to a text file'''
    r = redis.Redis(connection_pool=pool)  # one client is enough for the whole batch
    with open("./{}.txt".format(name), 'a+') as f:
        for proxy in proxies:
            r.lpush(name, proxy)   # push onto the Redis list
            f.write(proxy + '\n')  # and append to the file
    print("Saved {} proxies.".format(len(proxies)))

if __name__ == "__main__":
    base_url = "https://www.kuaidaili.com/free/inha/{}/"
    for i in range(1, 3000):
        print("Page {}".format(i))
        get_proxy(base_url.format(i))
        time.sleep(1)  # crawl politely, or Kuaidaili will ban *your* IP
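Once the list is populated, any crawler can pull a proxy back out of Redis and attach it to its requests. A minimal sketch of the consumer side, assuming the same Redis settings and the 'ip_proxy' list name used above (free proxies go stale fast, so expect some failures):

import redis
import requests

pool = redis.ConnectionPool(host='127.0.0.1', port=6379, db=1, password='')
r = redis.Redis(connection_pool=pool)

# Take one proxy off the list (rpop returns bytes, or None when the list is empty)
raw = r.rpop('ip_proxy')
if raw:
    proxy = raw.decode('utf-8')
    resp = requests.get(
        "https://www.1688.com/",
        headers={"User-Agent": "Mozilla/5.0"},
        proxies={"http": "http://" + proxy, "https": "http://" + proxy},
        timeout=5,
    )
    print(proxy, resp.status_code)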

