网络编程
位置:首页>> 网络编程>> Python编程>> python手机号前7位归属地爬虫代码实例

python手机号前7位归属地爬虫代码实例

作者:wanli001  发布时间:2021-01-23 05:20:34 

标签:python,手机,归属地,爬虫

需求分析

项目上需要用手机号前7位来判断号码是否合法,并查询归属地。旧的数据是几年前的,已经过时,打算用python爬虫重新爬一份。

单线程版本


# coding:utf-8
import requests
from datetime import datetime

class PhoneInfoSpider:
    """Crawl the Taobao phone-segment endpoint for a list of number prefixes.

    For every prefix a series of 11-digit numbers is generated
    (prefix + 4-digit middle + "0000") and queried one by one.  Each
    response that can be parsed is printed and appended to
    ``./result.txt`` as one comma-separated line.
    """

    def __init__(self, phoneSections):
        """Remember the number prefixes (strings such as ``'133'``) to crawl."""
        self.phoneSections = phoneSections

    @staticmethod
    def _formatRecord(textData):
        """Turn one raw response body into a CSV line, or ``None``.

        The response is read positionally: the single-quoted value on
        lines 1, 2, 3 and 5 (0-based) is taken as number, province,
        mobile_area and postcode respectively.
        NOTE(review): the field meaning is taken from the variable names
        only -- confirm against the live response format.

        Returns ``None`` when the body has fewer than 9 lines.  Raises
        ``IndexError`` if one of the picked lines holds no quoted value.
        """
        rows = textData.splitlines(True)
        if len(rows) < 9:
            return None
        number, province, mobile_area, postcode = (
            rows[index].split('\'')[1] for index in (1, 2, 3, 5)
        )
        return number + "," + province + "," + mobile_area + "," + postcode

    def phoneInfoHandler(self, textData):
        """Parse *textData*; print the record and append it to ./result.txt.

        Bodies that are too short to be a record are ignored silently.
        """
        line_text = self._formatRecord(textData)
        if line_text is None:
            return
        print(line_text)

        try:
            # The context manager closes the handle after every record, so
            # a long crawl does not accumulate open file descriptors.
            with open('./result.txt', 'a') as f:
                f.write(line_text + '\n')
        except OSError as e:
            print(Exception, ":", e)

    def requestPhoneInfo(self, phoneNum):
        """Fetch the segment info for *phoneNum* (a string) and handle it.

        Any failure (network, parsing) is printed and swallowed so that one
        bad number does not abort the whole crawl.
        """
        try:
            url = 'https://tcc.taobao.com/cc/json/mobile_tel_segment.htm?tel=' + phoneNum
            # Without a timeout a stalled connection would block forever.
            response = requests.get(url, timeout=10)
            self.phoneInfoHandler(response.text)
        except Exception as e:
            print(Exception, ":", e)

    def requestAllSections(self, last=0, stop=10):
        """Query every generated number of every stored prefix.

        Args:
            last: first middle value to query for the FIRST prefix only;
                lets a crawl resume after an abnormal exit.  All later
                prefixes start again at 0.
            stop: exclusive upper bound of the middle value.  The default
                of 10 keeps the original short run; pass 10000 to cover a
                whole prefix.
        """
        for head in self.phoneSections:
            head_begin = datetime.now()
            print(head + " begin time:" + str(head_begin))

            # Phone number = prefix + zero-padded middle + "0000".
            for i in range(last, stop):
                middle = str(i).zfill(4)
                phoneNum = head + middle + "0000"
                self.requestPhoneInfo(phoneNum)
            # The resume offset applies to the first prefix only.
            last = 0

            head_end = datetime.now()
            print(head + " end time:" + str(head_end))

if __name__ == '__main__':
    # Entry point of the single-threaded crawler: build the prefix list,
    # crawl every prefix and report the start and end time.
    task_begin = datetime.now()
    print("phone check begin time:" + str(task_begin))

    # Prefixes grouped as: China Telecom, China Unicom, China Mobile,
    # virtual operators.
    dx = ['133', '149', '153', '173', '177', '180', '181', '189', '199']
    # '166' used to be listed twice, which crawled that prefix twice and
    # wrote duplicate rows to the result file.
    lt = ['130', '131', '132', '145', '146', '155', '156', '166', '171', '175', '176', '185', '186']
    yd = ['134', '135', '136', '137', '138', '139', '147', '148', '150', '151', '152', '157', '158', '159', '172',
          '178', '182', '183', '184', '187', '188', '198']
    add = ['170']
    all_num = dx + lt + yd + add

    print(len(all_num))

    # Number segments to crawl.
    spider = PhoneInfoSpider(all_num)
    spider.requestAllSections()

    task_end = datetime.now()
    print("phone check end time:" + str(task_end))

发现爬取一个号段共需10000次查询,单线程版大概要1个半小时,太慢了。

多线程版本


# coding:utf-8
import requests
from datetime import datetime
import queue
import threading

# Number of worker threads started for each number prefix.
threadNum = 32

class MyThread(threading.Thread):
    """Thread that runs a zero-argument callable when started."""

    def __init__(self, func):
        """Store *func*, the callable executed by :meth:`run`."""
        super().__init__()
        self.func = func

    def run(self):
        """Call the stored callable in the new thread.

        This must be a method of the class: as a module-level function it
        would not override ``Thread.run`` and ``start()`` would do nothing.
        """
        self.func()

def requestPhoneInfo():
    """Worker loop: take tasks from the shared queue until it is empty.

    Reads the module-level ``q`` (task queue), ``lock`` and ``phone_head``
    (prefix currently being crawled).  Each task ``p`` is an integer; the
    queried number is ``phone_head`` + zero-padded ``p - 1`` + "0000".
    Request and parsing errors are printed and the loop continues.
    """
    while True:
        # Hold the lock so the emptiness check, the size print and the
        # get() all observe the same queue state.
        with lock:
            if q.qsize() == 0:
                break
            print("queue size:" + str(q.qsize()))
            p = q.get()  # take one task

        # Derive the middle digits from the task itself.  Reading
        # q.qsize() here, after the lock is released, races with the other
        # workers and yields duplicated or skipped numbers.  With tasks
        # queued in ascending order starting at 1, p - 1 equals the value
        # the old 9999 - q.qsize() expression gave without contention.
        middle = str(p - 1).zfill(4)
        phoneNum = phone_head + middle + "0000"
        print("phoneNum:" + phoneNum)

        try:
            url = 'https://tcc.taobao.com/cc/json/mobile_tel_segment.htm?tel=' + phoneNum
            # Without a timeout one stalled connection blocks the worker
            # forever and join() in the caller never returns.
            response = requests.get(url, timeout=10)
            phoneInfoHandler(response.text)
        except Exception as e:
            print(Exception, ":", e)

def _formatPhoneInfo(textData):
    """Return the CSV line for one raw response body, or ``None``.

    The single-quoted value on lines 1, 2, 3 and 5 (0-based) is taken as
    number, province, mobile_area and postcode.
    NOTE(review): the field meaning follows the variable names only --
    confirm against the live response format.
    """
    rows = textData.splitlines(True)
    if len(rows) < 9:
        return None
    number, province, mobile_area, postcode = (
        rows[index].split('\'')[1] for index in (1, 2, 3, 5)
    )
    return number + "," + province + "," + mobile_area + "," + postcode


def phoneInfoHandler(textData):
    """Parse *textData*; print the record and append it to ./result.txt.

    Bodies with fewer than 9 lines are ignored.  An ``IndexError`` from a
    line without a quoted value propagates to the caller.
    """
    line_text = _formatPhoneInfo(textData)
    if line_text is None:
        return
    print(line_text)

    try:
        # Close the handle after every record; the bare open() used before
        # leaked one file descriptor per written line.
        with open('./result.txt', 'a') as f:
            f.write(line_text + '\n')
    except OSError as e:
        print(Exception, ":", e)

if __name__ == '__main__':
    # Entry point of the multi-threaded crawler.  The names q, lock and
    # phone_head are assigned at module level on purpose: the worker
    # function requestPhoneInfo reads them as globals.
    task_begin = datetime.now()
    print("phone check begin time:" + str(task_begin))

    dx = ['133', '149', '153', '173', '177', '180', '181', '189', '199']
    # '166' used to be listed twice, which crawled that prefix twice and
    # wrote duplicate rows to the result file.
    lt = ['130', '131', '132', '145', '155', '156', '166', '171', '175', '176', '185', '186']
    yd = ['134', '135', '136', '137', '138', '139', '147', '150', '151', '152', '157', '158', '159', '172', '178',
          '182', '183', '184', '187', '188', '198']
    all_num = dx + lt + yd
    print(len(all_num))

    for head in all_num:
        head_begin = datetime.now()
        print(head + " begin time:" + str(head_begin))

        # Fresh queue and lock for every prefix.
        q = queue.Queue()
        threads = []
        lock = threading.Lock()
        # Set the prefix once, before any worker starts.
        phone_head = head

        # One task per middle value: 1..10000.
        for p in range(10000):
            q.put(p + 1)

        print(q.qsize())

        for _ in range(threadNum):
            thread = MyThread(requestPhoneInfo)
            thread.start()
            threads.append(thread)
        # Finish the current prefix before starting the next one.
        for thread in threads:
            thread.join()

        head_end = datetime.now()
        print(head + " end time:" + str(head_end))

    task_end = datetime.now()
    print("phone check end time:" + str(task_end))

多线程版的1个号码段10000条数据,大概2~3分钟就好,CPU使用率飙升,大概维持在70%左右。

总共40多个号段,爬完大概1,2个小时,总数据41w左右

来源:https://www.cnblogs.com/wanli002/p/11413281.html

0
投稿

猜你喜欢

手机版 网络编程 asp之家 www.aspxhome.com