使用socket和线程池爬取数据

使用socket和线程池爬取数据

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# 使用socket和线程池爬取数据
import socket
from urllib.parse import urlparse


def get_url(url, timeout=None):
    """Fetch *url* over plain HTTP with a raw socket and print the body.

    A minimal ``GET`` request is sent with ``Connection: close`` and the
    socket is read until the server closes it.  Everything after the first
    blank line (the end of the response headers) is printed to stdout.

    Only unencrypted HTTP is spoken: no TLS handshake is performed, and a
    chunked transfer encoding is not decoded.

    Args:
        url: Absolute URL including scheme and host, e.g.
            ``"http://example.com/list/?page=2"``.  An explicit port in the
            URL is honoured; otherwise port 80 is used.
        timeout: Optional socket timeout in seconds.  ``None`` (the default)
            keeps the socket fully blocking.

    Raises:
        ValueError: If *url* has no host component (for example because the
            scheme is missing), or its port is not a valid number.
        OSError: On connection, send or receive failures (including
            ``socket.timeout`` when *timeout* elapses).
    """
    parts = urlparse(url)
    host = parts.hostname
    if not host:
        raise ValueError(
            "URL must include a scheme and host, got {!r}".format(url)
        )
    port = parts.port or 80

    # The request target is the path *plus* the query string; sending only
    # the path would make every "?page=N" URL fetch the same resource.
    target = parts.path or "/"
    if parts.query:
        target = "{}?{}".format(target, parts.query)

    request = "GET {} HTTP/1.1\r\nHost:{}\r\nConnection:close\r\n\r\n".format(
        target, parts.netloc
    ).encode("utf8")

    chunks = []
    # The context manager closes the socket even if connect/send/recv raises.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as client:
        client.settimeout(timeout)
        # Blocking connect: the calling thread waits here without burning
        # CPU.  A non-blocking socket would instead have to poll in a loop
        # until the connection is established.
        client.connect((host, port))
        # sendall() retries until the whole request is written; send() may
        # transmit only part of the buffer.
        client.sendall(request)

        # "Connection: close" makes the server close the socket when the
        # response is complete, which recv() reports as an empty result.
        while True:
            chunk = client.recv(1024)
            if not chunk:
                break
            chunks.append(chunk)

    raw = b"".join(chunks)
    # Split on the FIRST blank line only, so a body that itself contains
    # "\r\n\r\n" is kept whole.  A response without the separator yields an
    # empty body instead of an IndexError.
    _headers, _separator, body = raw.partition(b"\r\n\r\n")
    # Pages that are not valid UTF-8 are printed with replacement characters
    # rather than aborting the whole fetch.
    print(body.decode("utf8", errors="replace"))

if __name__ == "__main__":
    # Demo driver: fetch pages 2..19 concurrently through a 10-worker thread
    # pool and print the total elapsed time in seconds.
    import time

    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()

    # Third-party dependency: pip install threadpool
    import threadpool

    pool = threadpool.ThreadPool(10)

    # NOTE(review): "xxx" looks like a placeholder for the real site's base
    # URL -- confirm and replace it before running.
    urllist = ["xxx/?page={}/".format(i) for i in range(2, 20)]

    # One work request per URL; each is queued on the pool, then wait()
    # is called before the timing is printed.
    for work_request in threadpool.makeRequests(get_url, urllist):
        pool.putRequest(work_request)
    pool.wait()

    # For a sequential baseline, call get_url(url) in a plain loop over the
    # same URLs instead of going through the pool.
    print(time.perf_counter() - start_time)
坚持原创技术分享,您的支持将鼓励我继续创作!