A Complete Web Scraping Case Study
Case requirements
In this case study we will scrape data from 电影天堂 (www.ygdy8.net): crawl the site's latest movie listings, save them into our own database, and then use what we have learned so far to display them all on a page.
The finished page lists each movie with its poster, title, and a download link.
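The snippets that follow assume Python 3 plus a couple of third-party packages. The exact environment is not spelled out in the original case, so treat this as a sketch of one working setup:

```python
# One possible environment for this case study (an assumption, not stated in the original text):
#   pip install pymysql requests
#   pip install sqlalchemy   # only needed for the object-oriented spider's models module
#
# Shared imports used by the procedural snippets below:
from urllib import request   # open the list and detail pages
import re                    # regular-expression extraction
import pymysql               # write to / read from the MySQL database
```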
Program entry point
```python
def main():
    try:
        # The list page to crawl
        list_url = "http://www.ygdy8.net/html/gndy/dyzz/list_23_2.html"
        # Open the list page
        req = request.urlopen(list_url)
        # Read the raw response bytes
        resp = req.read()
        # Decode the response (the site is GBK-encoded)
        resp_data = resp.decode('gbk', errors="ignore")
        # Extract the (url, title) pairs from the list page
        result_list = fetch_list(resp_data)
        # Fetch and save the detail data for every entry
        for item in result_list:
            detail = fetch_detail(item)
            save(detail)
            print(detail)
    except UnicodeDecodeError as e:
        print(e)

if __name__ == "__main__":
    main()
```
Crawling the list page
```python
def fetch_list(data):
    """
    Extract the movie entries from the list page.
    :param data: the decoded HTML of the list page
    :return: a list of (url, title) tuples
    """
    # Each entry on the list page is an <a ... class="ulink"> link
    result_list = re.findall('<a href="(.*)" class="ulink">(.*)</a>', data)
    return result_list
```
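As a quick sanity check, this is what fetch_list returns for a hand-written fragment shaped like the real list page (the href and title below are made-up examples, not real entries):

```python
sample = '<b><a href="/html/gndy/dyzz/20190305/58235.html" class="ulink">2019年科幻片《流浪地球》HD国语中字</a></b>'
print(fetch_list(sample))
# -> [('/html/gndy/dyzz/20190305/58235.html', '2019年科幻片《流浪地球》HD国语中字')]
```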
Crawling the detail page
```python
def fetch_detail(item):
    """
    Extract the details of a single movie.
    :param item: (url, title) as returned by fetch_list
    :return: (imgurl, title, download_url)
    """
    url = "http://www.ygdy8.net" + item[0]
    # Open the detail page
    req = request.urlopen(url)
    # Read the raw bytes
    resp_data = req.read()
    # Decode the page (also GBK-encoded)
    resp_content = resp_data.decode('gbk', errors="ignore")
    # Extract the cover image
    ret = re.search('<img border="0" src="(.*?)"', resp_content)
    imgurl = None
    if ret:
        imgurl = ret.group(1)
    else:
        print("No cover image found:", item[1])
        # Fall back to a default image
        imgurl = "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2535338893.jpg"
    # Extract the download address (the link text repeats the href, hence the \1 back-reference)
    result = re.search(r'bgcolor="#fdfddf"><a href="(.*)">\1</a>', resp_content)
    download_url = result.group(1)
    return (imgurl, item[1], download_url)
```
Saving the data
```python
# Open the connection once at module level
conn = pymysql.connect(host="localhost", port=3306, user="root", password="123456", charset="utf8")
cursor = conn.cursor()
# Create the database and table if they do not exist yet
cursor.execute("create database if not exists itmovie")
cursor.execute("use itmovie")
cursor.execute("create table if not exists t_movie("
               "id int primary key auto_increment,"
               "imgurl varchar(255),"
               "title varchar(100),"
               "downloadurl varchar(100))")
cursor.close()

def save(detail):
    """
    Save one scraped movie to the database.
    :param detail: (imgurl, title, download_url)
    :return: None
    """
    # A fresh cursor for this insert
    cursor = conn.cursor()
    # Parameterised insert; id is auto-incremented, so pass null for it
    ret = cursor.execute("insert into t_movie values(null,%s,%s,%s)", detail)
    print("Rows inserted:", ret)
    cursor.close()
    # Commit the transaction
    conn.commit()
```
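A quick way to smoke-test save() in isolation (the values below are placeholders, not real scraped data):

```python
save((
    "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2535338893.jpg",  # cover image
    "2019年科幻片《流浪地球》HD国语中字",                                          # title
    "ftp://example.com/movie.mkv",                                                  # download link (placeholder)
))
```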
An object-oriented spider
```python
import requests
import re
import threading
from models import Movie, Session

class MovieSpider():
    start_url = "http://www.ygdy8.net/html/gndy/dyzz/list_23_1.html"

    def run(self):
        response = requests.get(self.start_url)
        self.parse(response)

    def parse(self, response):
        response.encoding = "gbk"
        urls = re.findall('<a href="(.*)" class="ulink">.*</a>', response.text)
        for url in urls:
            url = "http://www.ygdy8.net" + url
            # Fetch every detail page in its own thread
            t = threading.Thread(target=self.send_request, args=(url, self.parse_item))
            t.start()

    def send_request(self, url, callback):
        response = requests.get(url)
        item = callback(response)
        # Hand the parsed item to the pipeline in another thread
        t = threading.Thread(target=self.pipeline, args=(item,))
        t.start()

    def pipeline(self, item):
        movie = Movie()
        movie.name = item["name"]
        movie.icon = item["icon"]
        movie.url = item["url"]
        session = Session()
        session.add(movie)
        session.commit()
        session.close()
        print("Movie saved")

    def parse_item(self, response):
        response.encoding = "gbk"
        # Extract the title
        name = re.search('<h1><font color=#07519a>(.*)</font></h1>', response.text).group(1)
        # Extract the cover image
        icon = re.search(r'<br /><br />\s*<img.*?src="(.*?)".*?/>\s*<br /><br />', response.text).group(1)
        print(name + "====" + icon)
        # Extract the download address
        url = re.search('bgcolor="#fdfddf"><a href="(.*?)">.*?</a>', response.text).group(1)
        return {"name": name, "icon": icon, "url": url}

if __name__ == '__main__':
    MovieSpider().run()
```
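The class above imports Movie and Session from a models module that is not shown in the case study. A minimal sketch of what that module could look like, assuming SQLAlchemy is used; the table name, column lengths, and connection string are my guesses, matched to the attributes used in pipeline():

```python
# models.py : hypothetical sketch, the original module is not shown in the case study
from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.orm import sessionmaker, declarative_base

# Connection string assumes the same local MySQL credentials as the procedural version
engine = create_engine("mysql+pymysql://root:123456@localhost:3306/itmovie?charset=utf8")
Base = declarative_base()

class Movie(Base):
    __tablename__ = "t_movie2"          # guessed name, kept separate from t_movie
    id = Column(Integer, primary_key=True, autoincrement=True)
    name = Column(String(100))          # movie title
    icon = Column(String(255))          # cover image URL
    url = Column(String(255))           # download link

Base.metadata.create_all(engine)        # create the table if it does not exist
Session = sessionmaker(bind=engine)     # session factory used by the spider
```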
A simple HTTP server
Server core code
```python
import socket
"""
Server outline:
1. Import the module
2. Create the socket
3. Bind the address and port
4. Start listening (put the socket into passive mode)
5. Wait for clients (each new client gets its own socket)
6. Handle the client's request
7. Build and send the response
8. Close the server socket
"""
# 1. Import the module (done above)
# 2. Create the socket
tcp_server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
# 3. Bind the address and port
tcp_server_socket.bind(("", 8000))
# 4. Start listening (passive mode)
tcp_server_socket.listen(128)
print("Server started!")
while True:
    # 5. Wait for a client (a new socket is created per connection)
    new_client_socket, ip_port = tcp_server_socket.accept()
    print("New client connected:", ip_port)
    try:
        # 6/7. Handle the request and send the response
        request_handler(new_client_socket)
    except Exception as e:
        print("Error while handling request:", e)
    else:
        print("Response sent successfully")
print("Server shutting down!")
# 8. Close the server socket (unreachable while the loop runs forever)
tcp_server_socket.close()
```
Request handling code
```python
def request_handler(new_client_socket):
    """
    Handle one client request.
    """
    # Receive the raw request data
    req_data = new_client_socket.recv(4096)
    # Decode the raw data
    req_content = req_data.decode('utf-8')
    # Print the request
    print("Request content:", req_content)
    # Build the status line and headers; HTTP requires CRLF line endings
    resp_line_and_headers = (
        "HTTP/1.1 200 OK\r\n"
        "Server: one new bility server\r\n"
        "Content-Type: text/html; charset=UTF-8\r\n"
        "Connection: keep-alive\r\n"
        "Accept-Ranges: bytes\r\n"
    )
    resp_body = "<h1>这是来自html页面的数据</h1>"
    # Headers and body are separated by a blank line
    resp_content = resp_line_and_headers + "\r\n" + resp_body
    print("Response content: %s" % resp_content)
    new_client_socket.send(resp_content.encode("utf-8"))
    new_client_socket.close()
```
Displaying the movie data
In this part we use the HTTP server together with HTML and CSS to display the data stored in the database; a sketch that wires the two together follows the "Filling the template" step below.
Template page
```html
<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
    <script type="text/javascript" src="http://files.cnblogs.com/Zjmainstay/ThunderURIEncode.js"></script>
    <style>
        .container{
            width: 80%;
            margin-left: auto;
            margin-right: auto;
        }
        .item{
            text-align: center;
            width: 200px;
            float: left;
            margin-top: 10px;
        }
        p{
            word-break: break-all;
            word-wrap: break-word;
        }
        img{
            width: 135px;
            height: 200px;
        }
    </style>
</head>
<body>
    <div class="container">
        <!-- Example of one rendered item:
        <div class="item">
            <img src="https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2535338893.jpg" />
            <p>流浪地球</p>
            <a href="#">点击下载</a>
        </div>
        -->
        {{content}}
    </div>
</body>
</html>
```
Fetching data from the database
```python
def fetch_movies():
    """
    Fetch every movie stored in the database.
    :return: a list of rows, one tuple per movie
    """
    # Open the connection
    conn = pymysql.connect(host="localhost", port=3306, database="itmovie", user="root", password="123456")
    # Get a cursor for executing SQL
    cursor = conn.cursor()
    # Run the query
    cursor.execute("select * from t_movie")
    # Grab all rows
    result_list = cursor.fetchall()
    # Clean up and return the rows
    cursor.close()
    conn.close()
    return result_list
```
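For reference, each row comes back in the column order of t_movie, which is what generate_content() below indexes into (the values here are illustrative only):

```python
rows = fetch_movies()
# Each row is (id, imgurl, title, downloadurl), for example:
# (1,
#  'https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2535338893.jpg',
#  '2019年科幻片《流浪地球》HD国语中字',
#  'ftp://example.com/movie.mkv')
```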
Generating the HTML content
```python
def generate_content():
    movies = fetch_movies()
    # Render every movie into the item structure used by the template
    item_template = """
    <div class="item">
        <img src="%s" />
        <p>%s</p>
        <a href="%s">点击下载</a>
    </div>
    """
    result = ""
    for movie in movies:
        # Show only the title between 《 and 》, falling back to the full title
        ret = re.search("《(.*)》", movie[2])
        title = ret.group(1) if ret else movie[2]
        result += item_template % (movie[1], title, create_thuder_url(movie[3]))
    return result
```
Filling the template
```python
def fill_template():
    """
    Fill the generated content into the template.
    :return: the final HTML page as a string
    """
    content = generate_content()
    with open("template.html", "r", encoding="utf-8") as temp:
        temp_content = temp.read()
    # Plain string replacement avoids re.sub mangling any backslashes in the content
    result = temp_content.replace("{{content}}", content)
    return result
```
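Putting the pieces together: a hypothetical variant of the earlier request_handler (the name movie_page_handler is mine, not from the original) could serve the filled template as the response body instead of a fixed string:

```python
def movie_page_handler(new_client_socket):
    # Sketch only: reuses the structure of request_handler above
    req_data = new_client_socket.recv(4096)
    print("Request content:", req_data.decode("utf-8", errors="ignore"))
    # Build the response body from the database-backed template
    resp_body = fill_template()
    resp_headers = (
        "HTTP/1.1 200 OK\r\n"
        "Content-Type: text/html; charset=UTF-8\r\n"
    )
    resp_content = resp_headers + "\r\n" + resp_body
    new_client_socket.send(resp_content.encode("utf-8"))
    new_client_socket.close()
```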
Thunder URL helper function
```python
import base64

def create_thuder_url(original_address):
    """
    Turn a plain download address into a Thunder (迅雷) link:
    wrap it in 'AA' ... 'ZZ', Base64-encode it, and prefix 'thunder://'.
    """
    original_address = str(original_address)
    original_address = 'AA' + original_address + 'ZZ'
    original_address = original_address.encode('gbk')
    original_address = base64.b64encode(original_address)
    thunder_address = 'thunder://' + original_address.decode()
    return thunder_address
```
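A quick usage check (the ftp address is a placeholder):

```python
link = create_thuder_url("ftp://example.com/movie.mkv")
# Produces a thunder:// link whose payload is the Base64 encoding of 'AAftp://example.com/movie.mkvZZ'
print(link)
```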