Python爬取国外天气预报网站的方法
作者:speedmancs 发布时间:2022-02-22 00:39:07
标签:Python,爬取,天气
本文实例讲述了Python爬取国外天气预报网站的方法。分享给大家供大家参考。具体如下:
crawl_weather.py如下:
#encoding=utf-8
import httplib
import urllib2
import time
from threading import Thread
import threading
from Queue import Queue
from time import sleep
import re
import copy
# Language segment substituted into the AccuWeather browse URL (see entry_url).
lang = "fr"
# Number of forecast URLs seen so far; GetLocationURLs increments it and
# prints a progress line every 500 URLs.
count = 0
class Location(object):
    """Plain record describing one named place on the weather site.

    Examples:
        Location(False, "China", "Beijing", "zh")   # a place inside a country
        Location(True, "", "Asia", "zh")            # a region above country level

    Attributes:
        is_beyond_country: value passed by the caller; the examples use True
            for a region above country level and False otherwise.
        country_name: name of the country (empty string in the region example).
        loc_name: name of the location itself.
        lang: language code associated with the names.
    """

    def __init__(self, is_beyond_country, country_name, loc_name, lang):
        self.country_name = country_name
        self.loc_name = loc_name
        self.lang = lang
        self.is_beyond_country = is_beyond_country

    def __repr__(self):
        # Mirrors the constructor call so instances are readable in logs.
        return "Location(%r, %r, %r, %r)" % (
            self.is_beyond_country,
            self.country_name,
            self.loc_name,
            self.lang,
        )
# Serialises updates to the shared progress counter (and its progress output)
# across the crawler threads.
prn_lock = threading.RLock()


def GetLocationURLs(url, recursive):
    """Collect location URLs reachable from an AccuWeather browse page.

    A URL that already contains "weather-forecast" is treated as a leaf: it is
    counted in the module-level ``count`` and returned as a one-element list
    without any download. Any other URL is downloaded and scanned for
    ``<h6><a href="..."><em>...</em></a></h6>`` links whose target contains
    "browse-locations" or "weather-forecast".

    Args:
        url: page address to inspect.
        recursive: when false, return the links found on this page only;
            when true, follow every link and return the leaf URLs found
            underneath.

    Returns:
        A list of URL strings.

    Raises:
        Whatever ``urllib2.urlopen`` / ``read`` raise for a failed download.
    """
    global count
    if "weather-forecast" in url:
        # The counter is shared by all worker threads, so the increment and
        # the progress check must happen under the same lock.
        with prn_lock:
            count = count + 1
            if count % 500 == 0:
                print("count:%d" % (count))
        return [url]

    response = urllib2.urlopen(url)
    try:
        page = response.read()
    finally:
        response.close()
    time.sleep(0.01)

    # Example of a matching line:
    # <h6><a href="http://www.accuweather.com/zh/browse-locations/afr"><em>Africa</em></a></h6>
    # Non-greedy groups keep several links on one physical line separate.
    pattern = "<h6><a href=\"(.*?)\"><em>(.*?)</em></a></h6>"
    links = [
        href
        for href, _name in re.findall(pattern, page)
        if "browse-locations" in href or "weather-forecast" in href
    ]
    if not recursive:
        return links

    urls = []
    for link in links:
        urls.extend(GetLocationURLs(link, True))
    return urls
# Starting page of the crawl (eur/fr -- presumably Europe/France).
# Alternatives that were tried: the site-wide root
# "http://www.accuweather.com/zh/browse-locations", optionally followed by one
# of the region codes afr, ant, arc, asi, cac, eur, mea, nam, ocn, sam.
entry_url = "http://www.accuweather.com/%s/browse-locations/eur/fr" % (lang)

# Only the first level is expanded here; each link found becomes one work
# item for the worker threads.
sub_urls = GetLocationURLs(entry_url, False)
print(len(sub_urls))
print(sub_urls)

# Shared state used by the workers: the result list and the lock guarding it.
location_urls = []
lock = threading.RLock()
ThreadNum = 5

q = Queue()
for sub_url in sub_urls:
    q.put(sub_url)
def working():
    """Worker loop: expand queued browse URLs into location URLs.

    Runs forever; each iteration takes one URL from ``q``, resolves it
    recursively with GetLocationURLs and appends the result to the shared
    ``location_urls`` list under ``lock``.
    """
    while True:
        url = q.get()
        try:
            lst = GetLocationURLs(url, True)
            print("%s %d urls " % (url, len(lst)))
            with lock:
                location_urls.extend(lst)
        except Exception as e:
            # A failed download must not kill the worker: a dead thread
            # would leave items unprocessed and q.join() would never return.
            print("%s failed: %s" % (url, e))
        finally:
            # Always balance q.get(), otherwise q.join() blocks forever.
            q.task_done()
# Start the daemon workers, wait until every queued URL has been processed,
# then store the collected URLs one per line.
for i in range(ThreadNum):
    t = Thread(target=working)
    # Daemon threads do not keep the process alive once the main thread ends.
    t.daemon = True
    t.start()
q.join()

# The context manager closes the file even if the write fails.
with open('locations.txt', "w") as fp:
    fp.write("\n".join(location_urls))
# Disabled code kept as a bare string literal; it is never executed.
# It references names that are not defined in this script (NUM, ppt_urls,
# srt_urls, video_urls) and a coursera.org URL, so it looks like a leftover
# from a separate download script -- NOTE(review): confirm and remove.
'''
def Fetch(url):
try:
print url
web_path = url[0]
local_name = url[1]
print "web_path:", web_path
print "local_name:", local_name
sContent = urllib2.urlopen(web_path).read()
savePath = "D:\\Course\\NLP_Manning\\%s" % (local_name)
print savePath
file = open(savePath,'wb')
file.write(sContent)
file.close()
print savePath + " saved";
except:
pass;
def working():
while True:
url = q.get()
Fetch(url)
sleep(10)
q.task_done()
#root_url = "https://class.coursera.org/nlp/lecture/index?lecture_player=flash"
root_url = "https://class.coursera.org/nlp/lecture/index?lecture_player=flash"
page = urllib2.urlopen(root_url).read()
for i in range(NUM):
t = Thread(target=working)
t.setDaemon(True)
t.start()
urls = copy.deepcopy(ppt_urls)
urls.extend(srt_urls)
urls.extend(video_urls)
print len(ppt_urls)
print len(srt_urls)
print len(video_urls)
print len(urls)
for url in urls:
q.put(url)
q.join()
'''
# Disabled snippet (also a bare string literal, never executed) that fetched
# a single forecast page and printed the raw response.
'''
root_url = "http://www.accuweather.com/zh/cn/andingmen/57494/weather-forecast/57494"
page = urllib2.urlopen(root_url).read()
print page
'''
FetchLocation.py如下:
#encoding=utf-8
import sys
import httplib
import urllib2
import time
from threading import Thread
import threading
from Queue import Queue
from time import sleep
import re
import copy
from xml.dom import minidom
import HTMLParser
import datetime
# Work queue; main() puts one list of URLs per item and each worker
# processes a whole list at a time.
q = Queue()
# Two re-entrant locks; the worker loop takes locks[1] while merging its
# results into the shared dictionaries below.
locks = [threading.RLock() for _ in range(2)]
# Number of worker threads started by main().
ThreadNumber = 20
# Shared result sets, stored as dict keys: location names and conditions.
locations = dict()
conds = dict()
def FindCountryBreadCrumbs(page):
    """Return the source lines of the ``<ul id="country-breadcrumbs">`` list.

    The page is scanned line by line. The result starts at the line holding
    the opening tag and ends at the first following line that contains
    ``</ul>`` (which may be the same line).

    Args:
        page: full HTML text of a forecast page.

    Returns:
        The matching lines joined with newlines, or an empty string when the
        list is missing or never closed.
    """
    lines = page.splitlines()
    start = -1
    end = -1
    opened = False
    for index, line in enumerate(lines):
        if "<ul id=\"country-breadcrumbs\">" in line:
            start = index
            opened = True
        if opened and "</ul>" in line:
            end = index
            opened = False
    if start == -1 or end == -1:
        # No complete breadcrumb list on this page: report "nothing found"
        # instead of failing on an unset end index.
        return ""
    return "\n".join(lines[start:end + 1])
def GetText(nodelist):
    """Join the HTML-unescaped data of every text node in ``nodelist``.

    Nodes of any other type are skipped; an empty string is returned when
    there is no text node at all.
    """
    pieces = [
        HTMLParser.HTMLParser().unescape(child.data)
        for child in nodelist
        if child.nodeType == child.TEXT_NODE
    ]
    return ''.join(pieces)
def FindCondition(page):
    """Extract the contents of every ``<span class="cond">`` in ``page``.

    Each captured text is HTML-unescaped and then encoded as UTF-8.

    Returns:
        A list with one entry per matching span, in page order.
    """
    pat = "<span class=\"cond\">(.*?)</span>"
    conditions = []
    for raw in re.findall(pat, page):
        decoded = HTMLParser.HTMLParser().unescape(raw)
        conditions.append(decoded.encode("utf-8"))
    return conditions
def ExtractInfo(url):
    """Download one page and extract breadcrumb names and weather conditions.

    Args:
        url: address of the page to download.

    Returns:
        A ``(locs, cds)`` pair of lists. ``locs`` holds, for every ``<li>`` of
        the country breadcrumb list, the UTF-8 encoded text of its first
        ``<a>`` and of its first ``<strong>`` (when present). ``cds`` is the
        result of FindCondition for the page. Both lists are empty when the
        download fails; ``locs`` is empty when the breadcrumb markup cannot
        be parsed.
    """
    try:
        response = urllib2.urlopen(url)
        try:
            page = response.read()
        finally:
            response.close()
    except Exception:
        # Callers unpack two values, so a failed download must still yield
        # a pair rather than a single empty list.
        return [], []

    locs = []
    try:
        text = FindCountryBreadCrumbs(page)
        # Entities are resolved before XML parsing -- presumably because the
        # XML parser rejects named HTML entities; verify against real pages.
        text = HTMLParser.HTMLParser().unescape(text)
        dom = minidom.parseString(text.encode("utf-8"))
    except Exception as e:
        # Best-effort scraping: report the page and keep going so one
        # malformed page does not take the calling worker thread down.
        sys.stderr.write("breadcrumbs not parsed for %s: %s\n" % (url, e))
        dom = None

    if dom is not None:
        for li in dom.getElementsByTagName("li"):
            adr_list = li.getElementsByTagName("a")
            if adr_list:
                locs.append(GetText(adr_list[0].childNodes).encode("utf-8"))
            strs = li.getElementsByTagName("strong")
            if strs:
                locs.append(GetText(strs[0].childNodes).encode("utf-8"))

    cds = FindCondition(page)
    return locs, cds
def AddMap(lst, m):
    """Record every element of ``lst`` as a key of the dict ``m``.

    Keys that are absent (or currently mapped to None) are set to 1; keys
    that already hold a value are left untouched. ``m`` is modified in place.
    """
    for x in lst:
        # Identity test: None is a singleton, and "is" cannot be fooled by
        # values that define their own equality.
        if m.get(x) is None:
            m[x] = 1
def working():
    """Worker loop: extract names and conditions for queued batches of URLs.

    Runs forever; each queue item is a list of URLs. Results of one batch are
    first collected in local dicts and then merged into the shared
    ``locations`` and ``conds`` dicts while holding ``locks[1]``.
    """
    while True:
        urls = q.get()
        try:
            m = {}
            m2 = {}
            for url in urls:
                try:
                    locs, cds = ExtractInfo(url)
                except Exception as e:
                    # Skip the page but keep the rest of the batch; a dead
                    # worker would leave q.join() waiting forever.
                    sys.stderr.write("skipped %s: %s\n" % (url, e))
                    continue
                AddMap(locs, m)
                AddMap(cds, m2)
            with locks[1]:
                AddMap(m.keys(), locations)
                AddMap(m2.keys(), conds)
        finally:
            # Always balance q.get() so q.join() in main() can return.
            q.task_done()
def main():
    """Read URLs from the file named on the command line and scrape them.

    The URLs are split into batches, processed by ``ThreadNumber`` daemon
    worker threads, and the collected keys are written to
    ``location_name.fr`` and ``conditions.fr`` (one entry per line).

    Exits with status 1 and a usage message when no file name is given.
    """
    if len(sys.argv) < 2:
        sys.stderr.write("usage: %s <file-with-one-url-per-line>\n" % sys.argv[0])
        sys.exit(1)
    loc_path = sys.argv[1]
    with open(loc_path, "r") as fp:
        # Blank lines would otherwise become empty URLs.
        urls = [line.strip() for line in fp if line.strip()]
    #urls = urls[0:1000]
    # Floor division keeps the batch size an integer on every Python version.
    blocks = len(urls) // ThreadNumber + 1
    for start in range(0, len(urls), blocks):
        # Slicing past the end is safe, so the last batch is simply shorter.
        q.put(urls[start:start + blocks])
    for i in range(ThreadNumber):
        t = Thread(target=working)
        t.daemon = True
        t.start()
    q.join()
    with open("location_name.fr", "w") as fp:
        fp.write("\n".join(locations.keys()))
    with open("conditions.fr", "w") as fp:
        fp.write("\n".join(conds.keys()))


if __name__ == '__main__':
    main()
希望本文所述对大家的python程序设计有所帮助。
0
投稿
猜你喜欢
- 强大的group by 代码如下:select stdname, isnull(sum( case stdsubject whe
- 这是python编写的用于测试网站访问速率的代码片段,可以输出打开某url的时间,访问100次的平均时间,最大时间和最小时间等等import
- 引用是什么在 PHP 中引用意味着用不同的名字访问同一个变量内容。这并不像 C 的指针,替代的是,引用是符号表别名。注意在 PHP 中,变量
- k-means算法思想较简单,说的通俗易懂点就是物以类聚,花了一点时间在python中实现k-means算法,k-means算法有本身的缺点
- 本文实例讲述了Python编程中的反模式。分享给大家供大家参考。具体分析如下:Python是时下最热门的编程语言之一了。简洁而富有表达力的语
- 显示图像是 Opencv最基本的操作之一, imshow()函数可以实现该操作。如果使用过其他GUI框架背景,就会很自然地调用 imshow
- 如何显示一个文本文件?完整显示文本文件的代码如下: Write(STRING) WriteLine(STRING) WriteBlan
- 在Pytorch中,torch.utils.data中的Dataset与DataLoader是处理数据集的两个函数,用来处理加载数据集。通常
- 一个客户提供一个股价的信息,要求放在页面上,显示一些数据,需要从远程获取xml,然后解析写在网页上,开始不会觉得很难,其实蛮简单的,先用ja
- 开启Web服务1.基本方式Python中自带了简单的服务器程序,能较容易地打开服务。在python3中将原来的SimpleHTTPServe
- 今天继续给大家介绍Python相关知识,本文主要内容是Python asyncio异步编程简单实现。一、asyncio事件循环简介async
- scrapy框架概述:Scrapy,Python开发的一个快速,高层次的屏幕抓取和web抓取框架,用于抓取web站点并从页面中提取结构化的数
- 见下:<% FOR i = 1 TO 1000 n =
- 这篇文章主要介绍了如何使用Python多线程测试并发漏洞,文中通过示例代码介绍的非常详细,对大家的学习或者工作具有一定的参考学习价值,需要的
- 虚拟环境管理创建虚拟环境#默认路径下创建虚拟环境conda create -n pythonVirtual python=x.x # -n:
- 本文实例讲述了php+html5基于websocket实现聊天室的方法。分享给大家供大家参考。具体如下:html5的websocket 实现
- 对于SQL的Join,在学习起来可能是比较乱的。我们知道,SQL的Join语法有很多inner的,有outer的,有left的,有时候,对于
- 一直在期待这本书,一直希望国内能有一本正视WEB标准,并且全面阐述WEB标准书籍。而这本书是我觉得国内最全面的一本关于WEB标准的书籍,这本
- 无论是公司的同事还是外界的程序员朋友们,大部分人对JavaScript的高级应用不甚了解,已有的知识架构里会认为JavaScript仅仅是一
- python中的print()函数和java中的System.out.print()函数都有着打印字符串的功能。python中:print(