使用socket和asyncio库爬取数据

### 使用socket和asyncio库爬取数据

# 使用socket和asynico库爬取数据
#asyncio 没有提供http协议的接口 aiohttp
import asyncio
import socket
from urllib.parse import urlparse


async def get_url(url):
    """Fetch *url* with a hand-rolled HTTP/1.1 GET over a raw TCP stream.

    asyncio provides no HTTP client of its own (use aiohttp for real work),
    so this issues the request manually via asyncio streams.

    Returns the response text split on the blank line between the headers
    and the body (a list of strings).
    """
    # Break the URL into host and path for the request line.
    parsed = urlparse(url)
    host = parsed.netloc
    path = parsed.path or "/"  # an empty path means the site root

    # Open a plain-HTTP TCP connection (port 80) through the event loop.
    reader, writer = await asyncio.open_connection(host, 80)
    try:
        writer.write(
            "GET {} HTTP/1.1\r\nHost:{}\r\nConnection:close\r\n\r\n"
            .format(path, host).encode("utf8")
        )
        # Flush the transport buffer so the request actually goes out.
        await writer.drain()

        all_lines = []
        async for raw_line in reader:
            all_lines.append(raw_line.decode("utf8"))
    finally:
        # The original leaked the connection; close it deterministically.
        writer.close()
        await writer.wait_closed()

    html = "\n".join(all_lines)
    # Headers and body are separated by the first empty CRLF line.
    html_data = html.split("\r\n\r\n")
    print(len(html_data))
    return html_data

async def main():
    """Crawl pages 2-19 concurrently and wait for every fetch to finish."""
    url = "xxx/?page={}"
    # The original awaited each task inside the loop, which serialized the
    # requests and defeated the point of asyncio. Create all tasks first,
    # then gather them so the fetches run concurrently.
    tasks = [asyncio.create_task(get_url(url.format(i))) for i in range(2, 20)]
    await asyncio.gather(*tasks)

if __name__ == "__main__":
    import time

    # Time the whole crawl; debug=True turns on asyncio's debug checks
    # (slow-callback warnings, un-awaited coroutine reports, etc.).
    start_time = time.time()
    asyncio.run(main(), debug=True)
    print('last time:{}'.format(time.time() - start_time))

https://docs.python.org/zh-cn/3/library/asyncio.html

坚持原创技术分享,您的支持将鼓励我继续创作!