
Scraping WeChat official account articles with Python

Author: 七凉可以不悲伤  Published: 2021-12-30 18:46:35

Tags: python, scraping, WeChat official account

This post shares the concrete code for scraping WeChat official account articles with Python, for your reference. The details are as follows:


# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException, Timeout, ConnectionError
import time
import random
import MySQLdb
import threading
import socket
import math

socket.setdefaulttimeout(60) # set a timeout on the whole socket layer; later socket use in this file inherits it

glock = threading.Lock() # global lock shared by all worker threads

CATEGORY_URL= ['http://www.we123.com/gzh/onclick/'] # region/category links
all_url = [] # (unused)
ALL_URLS = [] # all detail-page links
proxy_list = [] # proxy IP pool
URL = 'http://www.we123.com'
PAGE_URL = [] # all pagination links

# fetch the proxy IP pool
def get_ip():
 headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
 url = 'http://http-webapi.zhimaruanjian.com' # Zhima proxy can be used here: easy to use, stable and cheap
 resp = requests.get(url,headers=headers)
 obj = resp.json() # parse the JSON response (assumed to be a list of {'ip': ..., 'port': ...} items)
 for ip in obj:
   arr = 'http://' + str(ip['ip']) + ':' + str(ip['port'])
   proxy_list.append(arr)

# fetch the page source for a url
def get_html(url):
 # headers = {}
 user_agent_list = [
   'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.3538.400 QQBrowser/9.6.12501.400',
   'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
   'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0'
 ]
 user_agent = random.choice(user_agent_list) # rotate the User-Agent on every request
 headers = {
   'User-Agent': user_agent
 }
 # proxies: free proxies only stay alive for a short while, replace them with your own
 # proxy_list = [
 #   "http://27.192.185.62:3252",
 # ]
 # proxy_ip = random.choice(proxy_list)
 # proxies = {'http': proxy_ip}
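 # A possible way (an assumption, not part of the original code) to actually use the
 # global proxy_list filled by get_ip(): pick a random proxy and pass it to
 # requests.get via the proxies parameter, e.g.
 #   proxies = {'http': random.choice(proxy_list)}
 #   resp = requests.get(url, headers=headers, proxies=proxies)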
 # print(str(url))
 try:
   resp = requests.get(url,headers=headers)
   # print("status_code = " + str(resp.status_code))
   # print(type(resp.text))
   # print(resp.url) # the requested url
   return resp
 except Timeout:
   print("request timed out")
   return "error"
 except ConnectionError:
   print("connection error")
   return "error"
 except RequestException:
   print("other requests exception")
   with open('url_exception.txt','a+', encoding='utf-8') as f:
     f.write(str(url))
     f.write('\n')
   return "error"

# collect the region/category links
def get_category_url():
 url = 'http://www.we123.com/gzh/onclick/'
 resp = get_html(url)
 soup = BeautifulSoup(resp.text,'lxml')
 html = soup.select('div.div-subs2 > div.divst-content > div.divst-subs > li > a')
 # build the full link for each region (skip overseas, Taiwan and Macau)
 for i in html:
   city = i['href'].split("/")[-1]
   if (city == '海外' or city == '台湾' or city == '澳门'):
     continue
   url = URL + i['href']
   CATEGORY_URL.append(url)
 print(CATEGORY_URL)

# collect every pagination link under one region
def get_page_url(url):
 city = url.split('/')[-1]
 html = get_html(url)
 if html == "error":
   print("98行:connect url error")
   time.sleep(random.randint(10,20))
   return "error"
 soup = BeautifulSoup(html.text,'lxml')
 # total number of entries
 all_nums = soup.select("div.page > a > b")
 if len(all_nums) == 0:
   return "error"
 else:
   all_nums = all_nums[0].get_text()
 # total number of pages, 30 entries per page
 all_pages = math.ceil(int(all_nums) / 30)
 # build all the pagination links
 all_page_url = []
 for i in range(0,int(all_pages)):
   page_url = 'http://www.we123.com/e/action/ListInfo.php?page=' + str(i) + '&classid=45&line=30&tempid=10&orderby=onclick&myorder=0&totalnum=' + str(all_nums)
   all_page_url.append(page_url)
 return all_page_url

# pop one region category and collect all of its pagination links
def get_page_urls():
 global PAGE_URL
 c_url = CATEGORY_URL.pop()
 print('requesting category: ' + c_url)
 PAGE_URL = get_page_url(c_url) # all pagination links under this region

# worker: collect the detail-page links from every pagination page
def get_info_urls():
 while True:
   global PAGE_URL # shared global list
   glock.acquire() # lock
   if len(PAGE_URL) == 0:
     glock.release() # unlock
     print('get_info_urls: PAGE_URL is empty')
     break
   else:
     p_url = PAGE_URL.pop()
     print('requesting page: ' + p_url)
     glock.release() # unlock
     html = get_html(p_url)
     if html == "error":
       print("get_info_urls: connect url error")
       time.sleep(2)
       return
     soup = BeautifulSoup(html.text,'lxml')
     info_urls = soup.select('div.gzhRight > div.gzh_list > ul > li > a')
     glock.acquire() # lock again before touching the shared ALL_URLS list
     for x in info_urls:
       i_url = URL + x['href']
       ALL_URLS.append(i_url)
     print("detail links collected so far: " + str(len(ALL_URLS)))
     glock.release() # unlock
# worker: scrape the data we need from each detail page
def get_data():
 while True:
   global ALL_URLS # shared global list
   glock.acquire() # lock
   print("links left in the queue: " + str(len(ALL_URLS)))
   if len(ALL_URLS) == 0:
     glock.release() # unlock
     print('get_data: ALL_URLS is empty')
     break
   else:
     url = ALL_URLS.pop()
     print("start scraping: " + url)
     glock.release() # unlock
     time.sleep(1) # sleep for one second
     html = get_html(url)
     if html == "error":
       print("get_data: connect url error")
       time.sleep(random.randint(2, 4))
       return
     html.encoding = 'utf-8' # set the page encoding explicitly; usually not needed
     soup = BeautifulSoup(html.text,'lxml')
     # official account name
     names = soup.select('div.artcleLeft > div.xcxnry > div.xcxtop > div.xcxtop_left > div.gzhtop_logo > h1')
     # WeChat account id
     accounts = []
     accounts.append(soup.select('div.artcleLeft > div.xcxnry > div.xcxtop > div.xcxtop_left > div.gzhtop_logo > p')[0])
     # avatar image
     imgs = soup.select('div.artcleLeft > div.xcxnry > div.xcxtop > div.xcxtop_left > div.gzhtop_logo > img')
     # QR code of the account
     QR_codes = soup.select('div.artcleLeft > div.xcxnry > div.xcxtop > div.xcxtop_right > img')
     # introduction
     descs = soup.select('div.artcleLeft > div.xcxnry > div.xcxinfo')
     # account category (falls back to '综合', i.e. "general")
     category = ''
     cate = soup.select('div.artcleLeft > div.xcxnry > div.xcxtop > div.xcxtop_left > div.xcx_p > span > a')
     if len(cate) != 0:
       category = cate[0].get_text()
     else:
       category = '综合'
     glock.acquire() # lock while inserting (add_data may requeue the url on failure)
     for name,account,img,QR_code,desc in zip(names,accounts,imgs,QR_codes,descs):
       data = {
         'name':name.get_text(),
         'category':category,
         'account':account.get_text().split(":")[-1],
         'img':img['src'],
         'QR_code':QR_code['src'],
         'desc':desc.get_text()
       }
       add_data(data,url)
     glock.release() # unlock
# insert one record into MySQL (a new connection is opened for every call)
def add_data(data,url):
 con = MySQLdb.connect('127.0.0.1','root','root','test',charset="utf8",use_unicode=True)
 cursor = con.cursor()
 insert_sql = """
   insert ignore into weixin5(w_name,category,account,img,QR_code,introduce)
   VALUES (%s,%s,%s,%s,%s,%s)
   """
 try:
   cursor.execute(insert_sql,(data['name'],data['category'],data['account'],data['img'],data['QR_code'],str(data['desc'])))
   con.commit()
   print(data['name'] + '_' + data['account'] + ' added successfully - ' + url)
 except Exception:
   ALL_URLS.insert(0,url) # put the url back into the queue to retry later
   print(url + ' insert failed')
   con.rollback()
 con.close()

# convert a date string like "2021年12月30日" into a unix timestamp
def time_to(dt):
 timeArray = time.strptime(dt, "%Y年%m月%d日")
 timestamp = int(time.mktime(timeArray))
 return timestamp

# start the crawler with multiple threads
def main():
 for x in range(3):
   th = threading.Thread(target=get_info_urls)
   th.start()
    # get_info_urls()
 time.sleep(3)
 for x in range(5):
   th = threading.Thread(target=get_data)
   th.start()

if __name__ == '__main__':
 # timer
 t1 = time.time()
 # kick off the crawl
 get_ip() # fill the proxy IP pool
 get_page_urls()
 time.sleep(2)
 # get_category_url()
 main()
 print(time.time() - t1)
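
Note: add_data() assumes a MySQL table named weixin5 already exists in the test database, but the original post never shows its schema. Below is a minimal sketch of a table definition that matches the columns used in the insert statement above; the column types and the unique key on account are assumptions, so adjust them to your own data.

# -*- coding: utf-8 -*-
# Minimal sketch of the assumed weixin5 schema (column types are guesses).
import MySQLdb

CREATE_SQL = """
CREATE TABLE IF NOT EXISTS weixin5 (
  id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
  w_name VARCHAR(255) NOT NULL,     -- official account name
  category VARCHAR(64),             -- account category
  account VARCHAR(255),             -- WeChat account id
  img VARCHAR(512),                 -- avatar image url
  QR_code VARCHAR(512),             -- QR code image url
  introduce TEXT,                   -- introduction text
  UNIQUE KEY uk_account (account)   -- lets "insert ignore" skip duplicates
) DEFAULT CHARSET=utf8
"""

con = MySQLdb.connect('127.0.0.1', 'root', 'root', 'test', charset="utf8", use_unicode=True)
cursor = con.cursor()
cursor.execute(CREATE_SQL)
con.commit()
con.close()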

Source: https://blog.csdn.net/qq_32364939/article/details/78442243
