python利用xpath爬取网上数据并存储到django模型中
作者:cll_869241 发布时间:2022-01-19 03:46:51
标签:xpath,爬取,django,模型
帮朋友制作一个网站,需要一些产品数据信息,因为是代理其他公司产品,直接爬取代理公司产品数据
1.设计数据库
from django.db import models
from uuslug import slugify
import uuid
import os
def products_directory_path(instance, filename):
    """Build the upload path for a product's main image.

    The original file name is replaced by 8 random hex characters so uploads
    never collide; only the original extension (if any) is kept.

    Args:
        instance: object exposing a ``title`` attribute, used as the folder name.
        filename: name of the uploaded file.

    Returns:
        Relative path ``images/products/<title>/<random><ext>``.
    """
    # splitext yields '' for names without an extension, whereas
    # split('.')[-1] would return the whole name and append it as a fake suffix.
    ext = os.path.splitext(filename)[1]
    filename = uuid.uuid4().hex[:8] + ext
    return os.path.join('images', "products", instance.title, filename)
def product_relatedimage_directory_path(instance, filename):
    """Build the upload path for an image related to a product.

    The original file name is replaced by 8 random hex characters so uploads
    never collide; only the original extension (if any) is kept.

    Args:
        instance: object exposing ``product.title``, used as the folder name.
        filename: name of the uploaded file.

    Returns:
        Relative path ``images/product_relatedimage/<title>/<random><ext>``.
    """
    # splitext yields '' for names without an extension, whereas
    # split('.')[-1] would return the whole name and append it as a fake suffix.
    ext = os.path.splitext(filename)[1]
    filename = uuid.uuid4().hex[:8] + ext
    return os.path.join('images', "product_relatedimage", instance.product.title, filename)
class ProductsCategory(models.Model):
    """Product category, optionally nested under a parent category.

    The slug is generated from ``name`` when the row is first saved or
    whenever the slug is empty.
    """

    name = models.CharField('产品分类名', max_length=80, unique=True)
    description = models.TextField('产品分类描述', blank=True, null=True)
    slug = models.SlugField('slug', max_length=80, blank=True, null=True)
    parent_category = models.ForeignKey(
        'self', verbose_name="父级分类", blank=True, null=True, on_delete=models.CASCADE
    )

    def save(self, *args, **kwargs):
        """Fill in the slug if needed, then save as usual."""
        if not self.id or not self.slug:
            # A transliterated slug can be longer than the source text, so cap
            # it at the column width instead of letting the database reject it.
            slug_limit = self._meta.get_field('slug').max_length
            self.slug = slugify(self.name, max_length=slug_limit)
        super().save(*args, **kwargs)

    def __str__(self):
        return self.name

    class Meta:
        ordering = ['name']
        verbose_name = "产品分类"
        verbose_name_plural = verbose_name
class ProductsTag(models.Model):
    """Product tag.

    The slug is generated from ``name`` when the row is first saved or
    whenever the slug is empty.
    """

    name = models.CharField('产品标签名', max_length=30, unique=True)
    # blank=True: save() fills the slug in, so forms must not demand it.
    # (Validation-only change; it does not alter the database column.)
    slug = models.SlugField('slug', max_length=40, blank=True)

    def __str__(self):
        return self.name

    def save(self, *args, **kwargs):
        """Fill in the slug if needed, then save as usual."""
        if not self.id or not self.slug:
            # A transliterated slug can be longer than the source text, so cap
            # it at the column width instead of letting the database reject it.
            slug_limit = self._meta.get_field('slug').max_length
            self.slug = slugify(self.name, max_length=slug_limit)
        super().save(*args, **kwargs)

    class Meta:
        ordering = ['name']
        verbose_name = "产品标签"
        verbose_name_plural = verbose_name
class Product(models.Model):
    """A product with its image, technical parameters, category and tags.

    The slug is generated from ``title`` when the row is first saved or
    whenever the slug is empty.
    """

    title = models.CharField('标题', max_length=255, unique=True)
    slug = models.SlugField('slug', max_length=255, blank=True, null=True)
    # Technical parameters, stored as free text.
    jscs = models.TextField('技术参数', blank=True, null=True)
    image = models.ImageField(upload_to=products_directory_path, verbose_name="产品图片")
    views = models.PositiveIntegerField('浏览量', default=0)
    category = models.ForeignKey(
        'ProductsCategory', verbose_name='分类', on_delete=models.CASCADE, blank=True, null=True
    )
    tags = models.ManyToManyField('ProductsTag', verbose_name='标签集合', blank=True)

    def save(self, *args, **kwargs):
        """Fill in the slug if needed, then save as usual."""
        if not self.id or not self.slug:
            # A transliterated slug can be longer than the source text, so cap
            # it at the column width instead of letting the database reject it.
            slug_limit = self._meta.get_field('slug').max_length
            self.slug = slugify(self.title, max_length=slug_limit)
        super().save(*args, **kwargs)

    def update_views(self):
        """Increase the view counter by one, writing only the ``views`` column."""
        # Let the database do the increment: a Python-side ``+= 1`` loses
        # counts when two requests read the same value concurrently.
        self.views = models.F('views') + 1
        self.save(update_fields=['views'])
        # Swap the F() expression on this instance for the stored integer.
        self.refresh_from_db(fields=['views'])

    def get_pre(self):
        """Return the product with the closest smaller id, or None."""
        return Product.objects.filter(id__lt=self.id).order_by('-id').first()

    def get_next(self):
        """Return the product with the closest larger id, or None."""
        return Product.objects.filter(id__gt=self.id).order_by('id').first()

    def __str__(self):
        return self.title

    class Meta:
        verbose_name = "产品"
        verbose_name_plural = verbose_name
class ProductAdvantage(models.Model):
    """One advantage / feature line belonging to a product."""

    content = models.TextField('产品优势', blank=True, null=True)
    product = models.ForeignKey(Product, on_delete=models.CASCADE, blank=True, null=True)

    def __str__(self):
        # content is nullable; __str__ must always return a str, never None.
        return self.content or ''

    class Meta:
        verbose_name = "产品优势"
        verbose_name_plural = verbose_name
class ProductBody(models.Model):
    """One paragraph of descriptive content belonging to a product."""

    body = models.CharField('产品内容', max_length=256, blank=True, null=True)
    product = models.ForeignKey(Product, on_delete=models.CASCADE, blank=True, null=True)

    def __str__(self):
        # product is nullable; fall back to the body text (or '') rather than
        # raising AttributeError on a row that has no product attached.
        if self.product is None:
            return self.body or ''
        return self.product.title

    class Meta:
        verbose_name = "产品内容"
        verbose_name_plural = verbose_name
2.脚本编写
2.1编写获取网页源代码函数
def get_one_page(url):
    """Download ``url`` and return its body decoded as UTF-8.

    Args:
        url: address of the page to fetch.

    Returns:
        The page text, or None when the request fails or the HTTP status
        is not 200.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
    try:
        # Without a timeout a stalled server would hang the script forever.
        res = requests.get(url=url, headers=headers, timeout=10)
    except requests.RequestException:
        # Network-level failure (DNS, connection, timeout, ...): report "no page".
        return None
    if res.status_code != 200:
        return None
    res.encoding = 'utf-8'
    return res.text
2.2根据base页面获取所有产品分类页面链接
if __name__ == '__main__':
    # Download the base page and parse it into an element tree.
    base_tree = etree.HTML(get_one_page(url))
    # The navigation block holds one link per product category.
    category_hrefs = base_tree.xpath('//div[@class="fdh-01-nav"]/div/h3/a/@href')
    # Prefix every href with the site host and print the resulting URL.
    for href in category_hrefs:
        print(f'http://www.kexinjianji.com{href}')
2.3根据产品分类页面链接获取对应所有产品链接
if __name__ == '__main__':
    # Download one category page and parse it into an element tree.
    category_tree = etree.HTML(get_one_page(url))
    # Category name, read from the page heading.
    heading = category_tree.xpath('//div[@class="cplb-3n-ts-03 b"]/h3/span/text()')
    print(f"产品分类:{heading[0]}")
    # Links to every product listed under this category.
    product_hrefs = category_tree.xpath('//div[@class="cplb-3n-ts-03-list"]/dl/dt/a/@href')
    # Prefix every href with the site host and print the resulting URL.
    for href in product_hrefs:
        print(f'http://www.kexinjianji.com{href}')
    print("=====================================================")
两者结合起来就可以打印出所有产品链接
if __name__ == '__main__':
    # Walk base page -> category pages and print every product URL found.
    site_root = 'http://www.kexinjianji.com'
    content = get_one_page(url)
    if not content:
        # get_one_page returns None on failure; there is nothing to crawl then.
        raise SystemExit('出错url:' + url)
    tree = etree.HTML(content)
    # Category page links from the navigation block of the base page.
    catgory_urls = tree.xpath('//div[@class="fdh-01-nav"]/div/h3/a/@href')
    for category_href in catgory_urls:
        category_url = site_root + category_href
        content = get_one_page(category_url)
        if not content:
            # One unreachable category must not abort the remaining ones.
            print('出错url:' + category_url)
            continue
        tree = etree.HTML(content)
        # Category name, read from the page heading.
        catgory = tree.xpath('//div[@class="cplb-3n-ts-03 b"]/h3/span/text()')
        print("产品分类:" + catgory[0])
        # Links to every product listed under this category.
        urls = tree.xpath('//div[@class="cplb-3n-ts-03-list"]/dl/dt/a/@href')
        for product_href in urls:
            print(site_root + product_href)
        print("=====================================================")
2.4使用xpath解析产品链接对应页面的内容
if __name__ == '__main__':
    # Parse a single product page and print every field that will be stored.
    from urllib.parse import urljoin

    content = get_one_page(url)
    tree = etree.HTML(content)
    # Product name
    title = tree.xpath('//*[@id="wrap"]//h1/text()')
    images = tree.xpath('//div[@class="sol_tj_left"]/a/img/@src')
    # Product image: urljoin copes with src values with or without a leading
    # slash, so the result never contains a doubled '//' after the host.
    images_url = urljoin('http://www.kexinjianji.com/', images[0])
    # Performance features
    xntd = tree.xpath('//div[@class="w"]//div/span/text()|//div[@class="w"]//div/text()')
    # Technical parameters: the first <table>, serialised back to HTML
    jscs = tree.xpath('//table')[0]
    jscs_str = etree.tostring(jscs, encoding='utf-8').decode('utf-8')
    # Product content paragraphs
    cpnr = tree.xpath('//div[@class="describe"]/p')
    print('产品名称:' + title[0])
    print('产品图片:' + images_url)
    for td in xntd:
        print('性能特点:' + td)
    print('技术参数:' + jscs_str)
    for cp in cpnr:
        # string(.) returns all text under the current tag, nested tags included
        cp = cp.xpath('string(.)')
        print('产品内容:' + cp)
    print('============================================')
将三者结合在一起就可以获取所有产品信息
if __name__ == '__main__':
    # Walk base page -> category pages -> product pages and print each product.
    site_root = 'http://www.kexinjianji.com'
    content = get_one_page(url)
    if not content:
        # get_one_page returns None on failure; there is nothing to crawl then.
        raise SystemExit('出错url:' + url)
    tree = etree.HTML(content)
    # Category page links from the navigation block of the base page.
    catgory_urls = tree.xpath('//div[@class="fdh-01-nav"]/div/h3/a/@href')
    for category_href in catgory_urls:
        category_url = site_root + category_href
        content = get_one_page(category_url)
        if not content:
            # One unreachable category must not abort the remaining ones.
            print('出错url:' + category_url)
            continue
        tree = etree.HTML(content)
        # Category name, read from the page heading.
        catgory = tree.xpath('//div[@class="cplb-3n-ts-03 b"]/h3/span/text()')
        # Links to every product listed under this category.
        urls = tree.xpath('//div[@class="cplb-3n-ts-03-list"]/dl/dt/a/@href')
        for product_href in urls:
            url = site_root + product_href
            content = get_one_page(url)
            try:
                tree = etree.HTML(content)
                # Product name
                title = tree.xpath('//*[@id="wrap"]//h1/text()')
                images = tree.xpath('//div[@class="sol_tj_left"]/a/img/@src')
                # Product image
                images_url = site_root + images[0]
                # Performance features
                xntd = tree.xpath('//div[@class="w"]//div/span/text()|//div[@class="w"]//div/text()')
                # Technical parameters. Kept even though the print below is
                # disabled: a page without a <table> fails here and is
                # reported as an error URL.
                jscs = tree.xpath('//table')[0]
                jscs_str = etree.tostring(jscs, encoding='utf-8').decode('utf-8')
                # Product content paragraphs
                cpnr = tree.xpath('//div[@class="describe"]/p')
                print("产品分类:" + catgory[0])
                print('产品链接:' + url)
                print('产品名称:' + title[0])
                print('产品图片:' + images_url)
                for td in xntd:
                    print('性能特点:' + td.strip())
                # print('技术参数:' + jscs_str)
                for cp in cpnr:
                    # string(.) returns all text under the current tag, nested tags included
                    cp = cp.xpath('string(.)')
                    print('产品内容:' + cp)
                print('============================================')
            except Exception as e:
                # Best-effort crawl: report the failing product and move on.
                print(e)
                print('出错url:' + url)
3.存储到django模型
import requests
from lxml.html import etree
import os
import django
import uuid
from django.core.files.base import ContentFile
# Point Django at the project's settings module (unless the variable is
# already set) and initialise it so the ORM works in this standalone script.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "jiaobanzhan.settings")
django.setup()
# Deliberately imported after django.setup(): model classes need the app
# registry that setup() populates.
from products.models import ProductBody, ProductsCategory, Product, ProductAdvantage
# Page the crawl starts from; the category navigation links are read from it.
url = 'http://www.kexinjianji.com/product/hzshntjbz_1/'
def get_one_page(url):
    """Download ``url`` and return its body decoded as UTF-8.

    Args:
        url: address of the page to fetch.

    Returns:
        The page text, or None when the request fails or the HTTP status
        is not 200.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
    try:
        res = requests.get(url=url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        # Say which page failed and why, so failed URLs can be retried by hand.
        print('请求失败:{} ({})'.format(url, exc))
        return None
    if res.status_code != 200:
        return None
    res.encoding = 'utf-8'
    return res.text
if __name__ == '__main__':
    # Walk base page -> category pages -> product pages and store every
    # product (with its image, advantages and content paragraphs) via the ORM.
    site_root = 'http://www.kexinjianji.com'
    content = get_one_page(url)
    if not content:
        # get_one_page returns None on failure; there is nothing to crawl then.
        raise SystemExit('出错url:' + url)
    tree = etree.HTML(content)
    # Category page links from the navigation block of the base page.
    catgory_urls = tree.xpath('//div[@class="fdh-01-nav"]/div/h3/a/@href')
    for category_href in catgory_urls:
        category_url = site_root + category_href
        content = get_one_page(category_url)
        if not content:
            # One unreachable category must not abort the remaining ones.
            print('出错url:' + category_url)
            continue
        tree = etree.HTML(content)
        # Category name, read from the page heading.
        p_catgory = tree.xpath('//div[@class="cplb-3n-ts-03 b"]/h3/span/text()')
        # Links to every product listed under this category.
        urls = tree.xpath('//div[@class="cplb-3n-ts-03-list"]/dl/dt/a/@href')
        for product_href in urls:
            url = site_root + product_href
            content = get_one_page(url)
            try:
                tree = etree.HTML(content)
                # Product name
                title = tree.xpath('//*[@id="wrap"]//h1/text()')
                images = tree.xpath('//div[@class="sol_tj_left"]/a/img/@src')
                # Product image
                images_url = site_root + images[0]
                # Performance features
                xntd = tree.xpath('//div[@class="w"]//div/span/text()|//div[@class="w"]//div/text()')
                # Technical parameters: the first <table>, serialised back to HTML
                jscs = tree.xpath('//table')[0]
                jscs_str = etree.tostring(jscs, encoding='utf-8').decode('utf-8')
                # Product content paragraphs
                cpnr = tree.xpath('//div[@class="describe"]/p')
                # Fetch the category, creating it on first sight (one call
                # instead of exists() followed by get() or a manual save()).
                products_catgory, _ = ProductsCategory.objects.get_or_create(name=p_catgory[0])
                print(products_catgory)
                # Download the product image. Fail loudly on timeouts and HTTP
                # errors so an error page is never stored as the picture.
                image_response = requests.get(url=images_url, timeout=10)
                image_response.raise_for_status()
                ext = images_url.split('.')[-1]  # image type taken from the URL suffix
                filename = '{}.{}'.format(uuid.uuid4().hex[:8], ext)  # random file name
                # Wrap the raw bytes so Django's ImageField can store them.
                upload_image_file = ContentFile(image_response.content, name=filename)
                product = Product(title=title[0], jscs=jscs_str, image=upload_image_file, category=products_catgory)
                product.save()
                for td in xntd:
                    advantage_text = td.strip()
                    if not advantage_text:
                        # The XPath also matches whitespace-only text nodes
                        # between tags; they are not real advantages.
                        continue
                    ProductAdvantage.objects.create(content=advantage_text, product=product)
                for cp in cpnr:
                    # string(.) returns all text under the tag, nested tags included
                    ProductBody.objects.create(body=cp.xpath('string(.)'), product=product)
            except Exception as e:
                # Best-effort crawl: report the failing product and move on;
                # failed URLs are handled manually afterwards.
                print(e)
                print('出错url:' + url)
最后自己手动处理出错url(页面没有获取到技术参数,技术参数是一张图片)
4.总结
1.xpath 获取标签内容时,p标签中嵌套span标签,源码如下
<div class="describe" style="position: relative;">
<p><span>板 宽:</span>1500mm</p>
<p><span>板 厚:</span>4.5 mm</p>
<p><span>出料口:</span>6口</p>
<p><span>重 量:</span>6000 kg</p>
</div>
使用xpath获取p标签内容
我想得到的效果如下
板 宽:1500mm
板 厚:4.5 mm
出料口:6口
重 量:6000 kg
使用以下xpath 只能分开获取,不是想要的效果
//div[@class="describe"]/p/span/text()|//div[@class="describe"]/p/text()
百度之后找到的解决办法:使用 xpath('string(.)')
1.先获取所有p标签
cpnr = tree.xpath('//div[@class="describe"]/p')
2.对每个p标签使用 string(.) 获取该标签下的所有文本(包括嵌套标签中的文本)
cp = cp.xpath('string(.)')
循环遍历所有p标签即可
来源:https://blog.csdn.net/cll_869241/article/details/114005783


猜你喜欢
- 在crnn训练的时候需要用到lmdb格式的数据集,下面是python生成lmdb格式数据集的代码,注意一定要在linux系统下,否则会读入图
- 前言MySQL在2016年仍然保持强劲的数据库流行度增长趋势。越来越多的客户将自己的应用建立在MySQL数据库之上,甚至是从Oracle迁移
- 具体流程:① 导入相应的包,下载训练集和测试集对应需要的图像数据。②进行图像数据的变换,使图像数据转化成pytorch可识别并计算的张量数据
- 项目中使用的vue,刚好有需求要实现轮播图,突出显示当前图片,两边展示其他图片;通过查各种资料,实现了,故在此记录下来下面我们来看下实现步骤
- 修改models效果如下class EmailVerifyRecord(models.Model): code = models
- 如果按本文操作遇到一些问题报错,如C:\Users\milyyy\AppData\Roaming\npm-cache\_logs\2018-
- 序Python易用,但用好却不易,其中比较头疼的就是包管理和Python不同版本的问题,特别是当你使用Windows的时候。为了解决这些问题
- 安装 SQL2000 时,系统经常会提示:操作被挂起,要求重新启动计算机,如图1: 图1重新启动后,再次安装时问题仍然存在。解决办
- 下面,我们将会介绍 2014 年最火的 21 款JavaScript 框架,专为前端开发者准备的哦:)众所周知, JavaScript 框架
- PTB数据集内容如下:一行保存一个句子;将稀有单词替换成特殊字符 < unk > ;将具体的数字替换 成“N
- 本文实例讲述了js字符串操作方法。分享给大家供大家参考。具体如下:var str="This is my first Script
- 上次帮朋友写过的一个简单切换效果,超级简单,但也比较适用.因为用到了CSS Sprite技术,DEMO中附带了IE6兼容png的JS.核心J
- 如何去读取一个没有表头的二维csv文件(如下图所示)?并以元组的形式表现数据:((1.0, 0.0, 3.0, 180.0), (2.0,
- torch.autograd.backward(variables, grad_variables=None, retain_graph=N
- 代码如下:import functoolsdef memoize(fn): print('start memoize
- 12306火车票购票软件大家都用过,怎么用Python写一个命令行的火车票查看器,要求在命令行敲一行命令来获得你想要的火车票信息,下面通过本
- 生产者代码:# -* coding:utf8 *- from pykafka import KafkaClient host = '
- 思想:4个数字的排列,加上3个运算符的排列,使用后缀表达式的表现如下:情形一:1,2,3,4,+,-,* => 24*24*4情形二:
- 如果有一个多任务多loss的网络,那么在训练时,loss是如何工作的呢?比如下面:model = Model(inputs = input,
- VSCode 必须安装以下插件:首先你必须安装 Golang 插件,然后再给 Go 安装工具包。在 VS Code 中,使用快捷键: com