
Commit e7c336a

Initial commit (0 parents, commit e7c336a)

8 files changed, +189 -0 lines

README.md (+3)

@@ -0,0 +1,3 @@
# Sample code for the 慕课网 course "Python 开发简单爬虫" (Developing a Simple Web Crawler in Python)

__init__.py

Whitespace-only changes.

html_downloader.py (+14)

@@ -0,0 +1,14 @@
#!/usr/bin/env python2
# -*- coding: UTF-8 -*-
import urllib2


class HtmlDownloader(object):
    """Fetches a page over HTTP and returns its raw HTML, or None on failure."""

    def download(self, url):
        if url is None:
            return None
        response = urllib2.urlopen(url)
        # Treat anything other than HTTP 200 as a failed download.
        if response.getcode() != 200:
            return None
        return response.read()
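
The module above relies on urllib2, which exists only under Python 2. For readers on Python 3, a minimal equivalent sketch (an assumption, not part of this commit) can use urllib.request instead:

# Hypothetical Python 3 counterpart of HtmlDownloader, swapping the
# Python 2-only urllib2 module for urllib.request.
import urllib.request


class HtmlDownloader(object):

    def download(self, url):
        if url is None:
            return None
        response = urllib.request.urlopen(url)
        # getcode() is available on Python 3 responses as well.
        if response.getcode() != 200:
            return None
        return response.read()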

html_outputer.py (+29)

@@ -0,0 +1,29 @@
#!/usr/bin/env python2
# -*- coding: UTF-8 -*-
class HtmlOutputer(object):
    """Collects crawled records and writes them out as an HTML table."""

    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        fout = open('output.html', 'w')
        fout.write("<html>")
        fout.write("<body>")
        fout.write("<table>")

        # One table row per collected record: url, title, summary.
        for data in self.datas:
            fout.write("<tr>")
            fout.write("<td>%s</td>" % data['url'])
            fout.write("<td>%s</td>" % data['title'].encode('utf-8'))
            fout.write("<td>%s</td>" % data['summary'].encode('utf-8'))
            fout.write("</tr>")
        fout.write("</table>")
        fout.write("</body>")
        fout.write("</html>")
        fout.close()
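
A small usage sketch (hypothetical, not part of the commit) showing the record shape collect_data expects, the same url/title/summary keys that HtmlParser produces; it assumes the module is importable as html_outputer from the working directory:

# Hypothetical usage of HtmlOutputer: collect one record, then render the table.
from html_outputer import HtmlOutputer

outputer = HtmlOutputer()
outputer.collect_data({
    'url': 'http://baike.baidu.com/view/21087.htm',
    'title': u'Python',
    'summary': u'Python is a widely used programming language.',
})
outputer.output_html()  # writes output.html in the current directory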

html_parser.py (+42)

@@ -0,0 +1,42 @@
#!/usr/bin/env python2
# -*- coding: UTF-8 -*-
import re
import urlparse

from bs4 import BeautifulSoup


class HtmlParser(object):

    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return

        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data

    def _get_new_urls(self, page_url, soup):
        # Collect links that look like other encyclopedia entries (/view/<id>.htm)
        # and resolve them against the page that contained them.
        new_urls = set()
        links = soup.find_all('a', href=re.compile(r"/view/\d+\.htm"))
        for link in links:
            new_url = link['href']
            new_full_url = urlparse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        res_data = {}

        # url
        res_data['url'] = page_url

        # title, e.g. <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1>
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find("h1")
        res_data['title'] = title_node.get_text()

        # summary paragraph of the entry
        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()
        return res_data
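
A quick smoke test (hypothetical, not part of the commit) that feeds the parser a hand-written snippet mimicking the baike.baidu.com markup; it assumes the module is importable as html_parser from the working directory:

# Hypothetical check of HtmlParser on a tiny inline page.
from html_parser import HtmlParser

sample_html = (
    '<html><body>'
    '<dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>'
    '<div class="lemma-summary">Python is a programming language.</div>'
    '<a href="/view/10812319.htm">Guido van Rossum</a>'
    '</body></html>'
)

parser = HtmlParser()
new_urls, new_data = parser.parse("http://baike.baidu.com/view/21087.htm", sample_html)
print(new_urls)             # set(['http://baike.baidu.com/view/10812319.htm'])
print(new_data['title'])    # Python
print(new_data['summary'])  # Python is a programming language.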

requirements.txt (+2)

@@ -0,0 +1,2 @@
beautifulsoup4==4.4.1
wheel==0.24.0
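
Both pins can be installed with pip install -r requirements.txt; beautifulsoup4 provides the bs4 package imported by html_parser.py.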

spider_main.py (+69)

@@ -0,0 +1,69 @@
#!/usr/bin/env python2
# -*- coding: UTF-8 -*-

# Crawler scheduler

## URL manager

### Add a new URL to the set waiting to be crawled
### Check whether a URL about to be added is already in the container
### Fetch the next URL to crawl
### Check whether any URLs are still waiting to be crawled
### Move a URL from the waiting set to the crawled set

## Page downloader
### urllib2
### requests

## Page parser

### Regular expressions
### html.parser
### BeautifulSoup
### lxml


## Target analysis
### URL format
### Data format
### Page encoding


from baike_spider import url_manager, html_downloader, html_outputer, html_parser


class SpiderMain(object):

    def __init__(self):
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print 'craw %d : %s' % (count, new_url)
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)

                # Stop after 1000 pages.
                if count == 1000:
                    break
                count = count + 1
            except:
                # Any failure on a single page is reported and skipped.
                print 'craw failed'

        self.outputer.output_html()


if __name__ == "__main__":
    root_url = "http://baike.baidu.com/view/21087.htm"
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)

url_manager.py (+30)

@@ -0,0 +1,30 @@
#!/usr/bin/env python2
# -*- coding: UTF-8 -*-
class UrlManager(object):
    """Keeps two sets of URLs: waiting to be crawled and already crawled."""

    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def add_new_url(self, url):
        if url is None:
            return
        # Only queue URLs that have never been seen before.
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        # Hand out one pending URL and mark it as crawled.
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
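
A brief usage sketch (hypothetical, not part of the commit) of the bookkeeping above, assuming the module is importable as url_manager from the working directory:

# Hypothetical check of UrlManager: each URL is handed out at most once.
from url_manager import UrlManager

urls = UrlManager()
urls.add_new_url('http://baike.baidu.com/view/21087.htm')
urls.add_new_urls([
    'http://baike.baidu.com/view/21087.htm',      # duplicate, ignored
    'http://baike.baidu.com/view/10812319.htm',
])

while urls.has_new_url():
    print(urls.get_new_url())   # each of the two URLs appears exactly once

urls.add_new_url('http://baike.baidu.com/view/21087.htm')
print(urls.has_new_url())       # False: the URL is already in old_urls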
