How to Crawl a Foreign Weather Forecast Website with Python
Author: speedmancs  Published: 2022-02-22 00:39:07
Tags: Python, crawling, weather
This article demonstrates, with a working example, how to crawl a foreign weather forecast website (AccuWeather) with Python. The first script recursively harvests the forecast-page URL of every location; the second fetches those pages and extracts location names and weather-condition strings. It is shared here for your reference; the details follow.
crawl_weather.py is as follows:
#encoding=utf-8
import httplib
import urllib2
import time
from threading import Thread
import threading
from Queue import Queue
from time import sleep
import re
import copy
lang = "fr"
count = 0
class Location:
    # Location(False, "中国", "北京", "zh")
    # Location(True, "", "亚洲", "zh")
    def __init__(self, is_beyond_country, country_name, loc_name, lang):
        self.country_name = country_name
        self.loc_name = loc_name
        self.lang = lang
        self.is_beyond_country = is_beyond_country

prn_lock = threading.RLock()
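# GetLocationURLs walks AccuWeather's "browse-locations" hierarchy depth-first:
# a URL containing "weather-forecast" is a leaf (an actual forecast page) and is
# returned as-is, while a "browse-locations" page is fetched and its
# <h6><a ...><em>...</em></a></h6> links are followed recursively. The short
# time.sleep(0.01) between requests keeps the crawl from hammering the server.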
def GetLocationURLs(url, recursive):
    global count
    if url.find("weather-forecast") != -1:
        count = count + 1
        if count % 500 == 0:
            prn_lock.acquire()
            print "count:%d" % (count)
            prn_lock.release()
        return [url]
    page = urllib2.urlopen(url).read()
    time.sleep(0.01)
    # e.g. "<h6><a href=\"http://www.accuweather.com/zh/browse-locations/afr\"><em>Africa</em></a></h6>"
    pattern = "<h6><a href=\"(.*)\"><em>(.*)</em></a></h6>"
    locs = re.findall(pattern, page)
    locs = [(url, name) for url, name in locs if url.find("browse-locations") != -1 or url.find("weather-forecast") != -1]
    if not recursive:
        urls = [url for url, name in locs]
        return urls
    urls = []
    for _url, _name in locs:
        lst = GetLocationURLs(_url, True)
        urls.extend(lst)
    return urls
#entry_url = "http://www.accuweather.com/zh/browse-locations"
entry_url = "http://www.accuweather.com/%s/browse-locations/eur/fr" % (lang)
#regions = ["afr", "ant", "arc", "asi", "cac", "eur", "mea", "nam", "ocn", "sam"]
#regions = ["eur"]
#region_urls = [ "%s/%s" % (entry_url, reg) for reg in regions]
#region_urls = ["http://www.accuweather.com/zh/browse-locations/eur/fr"]
sub_urls = GetLocationURLs(entry_url, False)
print len(sub_urls)
print sub_urls
q = Queue()
location_urls = []
ThreadNum = 5
lock = threading.RLock()
for url in sub_urls:
    q.put(url)
def working():
    while True:
        url = q.get()
        lst = GetLocationURLs(url, True)
        print "%s %d urls " % (url, len(lst))
        lock.acquire()
        location_urls.extend(lst)
        lock.release()
        q.task_done()
for i in range(ThreadNum):
    t = Thread(target=working)
    t.setDaemon(True)
    t.start()
q.join()
fp = open('locations.txt', "w")
fp.write("\n".join(location_urls))
fp.close()
#for url in location_urls:
# print url
#location_urls = GetLocationURLs(entry_url)
# The triple-quoted block below is disabled scratch code (a Coursera lecture
# downloader) left over in the original article; it is not used by the crawler.
'''
def Fetch(url):
    try:
        print url
        web_path = url[0]
        local_name = url[1]
        print "web_path:", web_path
        print "local_name:", local_name
        sContent = urllib2.urlopen(web_path).read()
        savePath = "D:\\Course\\NLP_Manning\\%s" % (local_name)
        print savePath
        file = open(savePath, 'wb')
        file.write(sContent)
        file.close()
        print savePath + " saved"
    except:
        pass

def working():
    while True:
        url = q.get()
        Fetch(url)
        sleep(10)
        q.task_done()

root_url = "https://class.coursera.org/nlp/lecture/index?lecture_player=flash"
page = urllib2.urlopen(root_url).read()
for i in range(NUM):
    t = Thread(target=working)
    t.setDaemon(True)
    t.start()
urls = copy.deepcopy(ppt_urls)
urls.extend(srt_urls)
urls.extend(video_urls)
print len(ppt_urls)
print len(srt_urls)
print len(video_urls)
print len(urls)
for url in urls:
    q.put(url)
q.join()
'''
'''
root_url = "http://www.accuweather.com/zh/cn/andingmen/57494/weather-forecast/57494"
page = urllib2.urlopen(root_url).read()
print page
'''
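Both scripts in this article target Python 2 (urllib2, Queue, HTMLParser and print statements were all removed or renamed in Python 3). As a reference point only, here is a minimal Python 3 sketch of the same recursive URL-harvesting idea. It assumes the era's AccuWeather markup quoted in the comments above, which the live site has long since changed, so treat it as illustrative rather than a drop-in replacement:
#encoding=utf-8
# Minimal Python 3 sketch of crawl_weather.py's core idea (illustrative only).
import re
import time
import urllib.request

PATTERN = r'<h6><a href="(.*?)"><em>(.*?)</em></a></h6>'

def get_location_urls(url):
    # A "weather-forecast" URL is a leaf: return it directly.
    if "weather-forecast" in url:
        return [url]
    # Otherwise fetch the "browse-locations" page and follow its links.
    page = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
    time.sleep(0.01)  # small delay between requests
    urls = []
    for href, _name in re.findall(PATTERN, page):
        if "browse-locations" in href or "weather-forecast" in href:
            urls.extend(get_location_urls(href))
    return urls

if __name__ == "__main__":
    entry_url = "http://www.accuweather.com/fr/browse-locations/eur/fr"
    found = get_location_urls(entry_url)
    print(len(found))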
FetchLocation.py is as follows:
#encoding=utf-8
import sys
import httplib
import urllib2
import time
from threading import Thread
import threading
from Queue import Queue
from time import sleep
import re
import copy
from xml.dom import minidom
import HTMLParser
import datetime
q = Queue()
locks = [threading.RLock() for i in range(2)]
ThreadNumber = 20
locations = {}
conds = {}
def FindCountryBreadCrumbs(page):
    # Return the <ul id="country-breadcrumbs"> ... </ul> fragment of the page.
    lines = page.splitlines()
    count = 0
    start = -1
    end = -1  # guard: without this, a page lacking breadcrumbs raised NameError
    opened = False
    for line in lines:
        if line.find("<ul id=\"country-breadcrumbs\">") != -1:
            start = count
            opened = True
        if opened and line.find("</ul>") != -1:
            end = count
            opened = False
        count = count + 1
    return "\n".join(lines[start: (end + 1)])
def GetText(nodelist):
    rc = []
    for node in nodelist:
        if node.nodeType == node.TEXT_NODE:
            rc.append(HTMLParser.HTMLParser().unescape(node.data))
    return ''.join(rc)
def FindCondition(page):
    pat = "<span class=\"cond\">(.*?)</span>"
    cds = re.findall(pat, page)
    cds = [HTMLParser.HTMLParser().unescape(cd).encode("utf-8") for cd in cds]
    return cds
def ExtractInfo(url):
    try:
        page = urllib2.urlopen(url).read()
    except Exception, e:
        # Return an empty pair: callers unpack "locs, cds = ExtractInfo(url)",
        # so the original "return []" raised a ValueError on failure.
        return [], []
    text = FindCountryBreadCrumbs(page)
    text = HTMLParser.HTMLParser().unescape(text)
    dom = minidom.parseString(text.encode("utf-8"))
    locs = []
    lis = dom.getElementsByTagName("li")
    for li in lis:
        adr_list = li.getElementsByTagName("a")
        if adr_list:
            locs.append(GetText(adr_list[0].childNodes).encode("utf-8"))
        strs = li.getElementsByTagName("strong")
        if strs:
            locs.append(GetText(strs[0].childNodes).encode("utf-8"))
    cds = FindCondition(page)
    return locs, cds
def AddMap(lst, m):
    for x in lst:
        if m.get(x) == None:
            m[x] = 1
def working():
    while True:
        urls = q.get()
        #print len(urls)
        m = {}
        m2 = {}
        count = 0
        for url in urls:
            count = count + 1
            #print "%d/%d" % (count, len(urls))
            locs, cds = ExtractInfo(url)
            AddMap(locs, m)
            AddMap(cds, m2)
        locks[1].acquire()
        AddMap(m.keys(), locations)
        AddMap(m2.keys(), conds)
        locks[1].release()
        q.task_done()
def main():
    if len(sys.argv) < 2:
        exit()
    loc_path = sys.argv[1]
    fp = open(loc_path, "r")
    urls = [line.strip() for line in fp]
    fp.close()
    #urls = urls[0:1000]
    blocks = len(urls) / ThreadNumber + 1
    for start in range(0, len(urls), blocks):
        end = start + blocks
        if end > len(urls):
            end = len(urls)
        q.put(urls[start:end])
    for i in range(ThreadNumber):
        t = Thread(target=working)
        t.setDaemon(True)
        t.start()
    q.join()
    fp = open("location_name.fr", "w")
    fp.write("\n".join(locations.keys()))
    fp.close()
    fp = open("conditions.fr", "w")
    fp.write("\n".join(conds.keys()))
    fp.close()

if __name__ == '__main__':
    main()
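Assuming the AccuWeather URL layout of the time, the two scripts are meant to be run in sequence: crawl_weather.py writes the harvested forecast-page URLs to locations.txt, and FetchLocation.py takes that file as its first command-line argument, writing the de-duplicated results to location_name.fr and conditions.fr:
python crawl_weather.py
python FetchLocation.py locations.txt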
I hope this article is helpful to everyone's Python programming.