A Detailed Guide to Building a Vertical (Focused) Crawler in Python
Author: bwt_D  Published: 2023-11-17 13:33:38
Tags: python, vertical, crawler, system
The crawler is split into small, single-purpose modules: html_downloader fetches pages, html_parser extracts links and article fields, urls_manager deduplicates the URL queue, html_outputer writes the results into an HTML table, and spider_main drives the loop (test_64 is a standalone BeautifulSoup demo).
html_downloader
from urllib import request

def download(url):
    # Fetch a page and return its raw bytes, or None on failure.
    if url is None:
        return None
    response = request.urlopen(url)
    if response.getcode() != 200:
        return None
    return response.read()
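
As written, download lets network errors propagate and sends urllib's default User-Agent, which some sites reject. A minimal hardened sketch (the header value, the timeout, and the name download_safe are illustrative assumptions, not part of the original module):

from urllib import request, error

def download_safe(url, timeout=10):
    # Hypothetical variant: browser-like User-Agent, explicit timeout,
    # and network errors turned into a None return instead of a crash.
    if url is None:
        return None
    req = request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        response = request.urlopen(req, timeout=timeout)
    except error.URLError:
        return None
    if response.getcode() != 200:
        return None
    return response.read()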
html_outputer
data_list = []

def collect_data(data):
    data_list.append(data)

def output_html():
    # Open with an explicit encoding so Chinese titles survive on
    # platforms whose default encoding is not UTF-8.
    fout = open('output.html', 'w', encoding='utf-8')
    fout.write('<html>')
    fout.write('<body>')
    fout.write('<table>')
    for dataitem in data_list:
        fout.write('<tr>')
        fout.write('<td>%s</td>' % dataitem['url'])
        fout.write('<td>%s</td>' % dataitem['title'])
        fout.write('<td>%s</td>' % dataitem['datetime'])
        fout.write('<td>%s</td>' % dataitem['visitcount'])
        fout.write('</tr>')
    fout.write('</table>')
    fout.write('</body>')
    fout.write('</html>')
    fout.close()
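
Scraped titles can contain characters such as < or &, which would break the generated markup. A sketch of an escaped variant using the standard html module (output_html_escaped and its path parameter are assumptions for illustration, not the article's code):

import html

def output_html_escaped(path='output.html'):
    # Same table layout as output_html, but every cell value is
    # HTML-escaped and the file is closed by the with-block.
    with open(path, 'w', encoding='utf-8') as fout:
        fout.write('<html><body><table>')
        for item in data_list:
            cells = (item['url'], item['title'],
                     item['datetime'], item['visitcount'])
            fout.write('<tr>')
            for cell in cells:
                fout.write('<td>%s</td>' % html.escape(str(cell)))
            fout.write('</tr>')
        fout.write('</table></body></html>')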
html_parser
import re
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def get_new_urls(page_url, soup):
    # Collect links whose href matches the site's article URL pattern.
    new_urls = set()
    links = soup.find_all('a', href=re.compile(r"/\d+/\d+/\w+/page\.htm"))
    for link in links:
        new_url = link['href']
        # Resolve relative hrefs against the page they came from.
        new_full_url = urljoin(page_url, new_url)
        new_urls.add(new_full_url)
    return new_urls

def get_new_data(page_url, soup):
    # Extract the title, update time and visit count of an article page.
    res_data = {}
    title_node = soup.find('h1', class_='arti-title')
    if title_node is None:
        # Not an article page: return an empty dict.
        return res_data
    res_data['title'] = title_node.get_text()
    datetime_node = soup.find('span', class_='arti-update')
    res_data['datetime'] = datetime_node.get_text() if datetime_node else ''
    visitcount_node = soup.find('span', class_='WP_VisitCount')
    res_data['visitcount'] = visitcount_node.get_text() if visitcount_node else ''
    res_data['url'] = page_url
    return res_data

def parse(page_url, html_cont):
    # Return (None, None) rather than a bare None so callers can
    # always unpack the result into two values.
    if page_url is None or html_cont is None:
        return None, None
    soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
    new_urls = get_new_urls(page_url, soup)
    new_data = get_new_data(page_url, soup)
    return new_urls, new_data
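
The parser can be sanity-checked without touching the network by feeding it a handcrafted snippet that mimics the site's markup (the snippet below is invented; only the class names and URL pattern come from the selectors above):

sample_html = b"""
<html><body>
<h1 class="arti-title">Sample headline</h1>
<span class="arti-update">2023-11-17</span>
<span class="WP_VisitCount">42</span>
<a href="/2023/1117/c123a456/page.htm">next article</a>
</body></html>
"""
urls, data = parse('http://news.zzuli.edu.cn/index.htm', sample_html)
print(urls)  # {'http://news.zzuli.edu.cn/2023/1117/c123a456/page.htm'}
print(data['title'], data['visitcount'])  # Sample headline 42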
spider_main
import urls_manager, html_downloader, \
    html_parser, html_outputer

def craw(root_url):
    count = 1
    urls_manager.add_new_url(root_url)
    # Start the crawl loop.
    while urls_manager.has_new_url():
        new_url = urls_manager.get_new_url()
        print('craw %d : %s' % (count, new_url))
        html_cont = html_downloader.download(new_url)
        new_urls, new_data = html_parser.parse(new_url, html_cont)
        urls_manager.add_new_urls(new_urls)
        if new_data:
            html_outputer.collect_data(new_data)
        if count == 10:
            # Cap the demo at 10 pages.
            break
        count = count + 1
    html_outputer.output_html()

if __name__ == '__main__':
    root_url = 'http://news.zzuli.edu.cn/'
    craw(root_url)
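
One fragile spot in craw is that a single network error or malformed page aborts the whole run. A hypothetical hardened variant (craw_safe and its max_pages parameter are not in the original article) that skips failing pages instead:

def craw_safe(root_url, max_pages=10):
    # Like craw, but a per-page failure is reported and skipped
    # rather than crashing the crawl.
    count = 1
    urls_manager.add_new_url(root_url)
    while urls_manager.has_new_url():
        new_url = urls_manager.get_new_url()
        try:
            html_cont = html_downloader.download(new_url)
            new_urls, new_data = html_parser.parse(new_url, html_cont)
        except Exception as exc:
            print('craw failed for %s: %s' % (new_url, exc))
            continue
        urls_manager.add_new_urls(new_urls)
        if new_data:
            html_outputer.collect_data(new_data)
        if count >= max_pages:
            break
        count += 1
    html_outputer.output_html()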
test_64
from bs4 import BeautifulSoup
import re
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc, 'html.parser')
print('Get all links')
links = soup.find_all('a')
for link in links:
    print(link.name, link['href'], link.get_text())

print('Get the Lacie link')
link_node = soup.find('a', href='http://example.com/lacie')
print(link_node.name, link_node['href'], link_node.get_text())

print('Regex match on the href')
link_node = soup.find('a', href=re.compile(r'ill'))
print(link_node.name, link_node['href'], link_node.get_text())

print('Get the title paragraph text')
p_node = soup.find('p', class_='title')
print(p_node.name, p_node.get_text())
urls_manager
# Two sets give O(1) deduplication: new_urls holds URLs waiting to
# be crawled, old_urls holds URLs that have already been fetched.
new_urls = set()
old_urls = set()

def add_new_url(url):
    if url is None:
        return
    if url not in new_urls and url not in old_urls:
        new_urls.add(url)

def add_new_urls(urls):
    if urls is None or len(urls) == 0:
        return
    for url in urls:
        add_new_url(url)

def get_new_url():
    # set.pop() removes an arbitrary pending URL.
    new_url = new_urls.pop()
    old_urls.add(new_url)
    return new_url

def has_new_url():
    return len(new_urls) != 0
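
Because the two sets live at module level, the manager acts as a process-wide singleton. A quick interactive check of the deduplication behaviour (run against a freshly imported module):

add_new_url('http://news.zzuli.edu.cn/')
add_new_url('http://news.zzuli.edu.cn/')  # duplicate, ignored
print(has_new_url())                      # True
url = get_new_url()
print(url in old_urls, has_new_url())     # True False
add_new_url(url)                          # already crawled, ignored
print(has_new_url())                      # False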
Source: https://blog.csdn.net/bwt_D/article/details/123238555