Python Scrapy Crawler Tutorial in Detail: Storing NBA Player Data in a MySQL Database
Author: 我不是秃头哆唻咪  Published: 2024-01-24 03:27:04
Tags: Scrapy, crawler, MySQL
Get the URL to crawl
Preliminary work for the crawler
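Before writing any Scrapy code it is worth confirming that the player-list endpoint really returns JSON in the expected shape. A minimal sketch of that check, assuming the requests package is installed (the URL and the payload/players structure are the same ones the spider below relies on):

import requests

# The same endpoint the spider will crawl
url = 'https://china.nba.com/static/data/league/playerlist.json'
resp = requests.get(url)
# The player records sit under payload -> players
players = resp.json()['payload']['players']
print(len(players), 'players returned')
# Peek at one record to see the playerProfile / teamProfile keys used later
print(list(players[0].keys()))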
Open the project in PyCharm and start writing the spider files
Field definitions file: items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class NbaprojectItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # pass
    # Every field is declared with the same fixed form: scrapy.Field()
    # English name
    engName = scrapy.Field()
    # Chinese name
    chName = scrapy.Field()
    # Height
    height = scrapy.Field()
    # Weight
    weight = scrapy.Field()
    # Country name in English
    contryEn = scrapy.Field()
    # Country name in Chinese
    contryCh = scrapy.Field()
    # Years of NBA experience
    experience = scrapy.Field()
    # Jersey number
    jerseyNo = scrapy.Field()
    # Draft year
    draftYear = scrapy.Field()
    # Team name in English
    engTeam = scrapy.Field()
    # Team name in Chinese
    chTeam = scrapy.Field()
    # Position
    position = scrapy.Field()
    # Conference display name
    displayConference = scrapy.Field()
    # Division
    division = scrapy.Field()
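An NbaprojectItem behaves much like a dictionary, which is how the spider fills it later. A tiny illustration with made-up values (not real player data):

# Made-up values, only to show the dict-style access the spider below relies on
item = NbaprojectItem()
item['engName'] = 'SamplePlayer'
item['jerseyNo'] = '23'
print(item['engName'])
print(dict(item))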
Spider file
import scrapy
import json
from nbaProject.items import NbaprojectItem
class NbaspiderSpider(scrapy.Spider):
    name = 'nbaSpider'
    allowed_domains = ['nba.com']
    # URLs for the first requests; more than one can be listed
    # start_urls = ['http://nba.com/']
    start_urls = ['https://china.nba.com/static/data/league/playerlist.json']

    # Handle the response returned for each URL
    def parse(self, response):
        # The site returns JSON, so decode it with the standard-library json module first
        data = json.loads(response.text)['payload']['players']
        # Counter used only for the progress message
        count = 1
        # Build one item per player straight from the JSON payload
        for i in data:
            # Create the item object that carries the fields -- this is the NbaprojectItem imported above
            item = NbaprojectItem()
            # English name
            item['engName'] = str(i['playerProfile']['firstNameEn'] + i['playerProfile']['lastNameEn'])
            # Chinese name
            item['chName'] = str(i['playerProfile']['firstName'] + i['playerProfile']['lastName'])
            # Country name in English
            item['contryEn'] = str(i['playerProfile']['countryEn'])
            # Country name in Chinese
            item['contryCh'] = str(i['playerProfile']['country'])
            # Height
            item['height'] = str(i['playerProfile']['height'])
            # Weight
            item['weight'] = str(i['playerProfile']['weight'])
            # Years of NBA experience
            item['experience'] = str(i['playerProfile']['experience'])
            # Jersey number
            item['jerseyNo'] = str(i['playerProfile']['jerseyNo'])
            # Draft year
            item['draftYear'] = str(i['playerProfile']['draftYear'])
            # Team name in English
            item['engTeam'] = str(i['teamProfile']['code'])
            # Team name in Chinese
            item['chTeam'] = str(i['teamProfile']['displayAbbr'])
            # Position
            item['position'] = str(i['playerProfile']['position'])
            # Conference display name
            item['displayConference'] = str(i['teamProfile']['displayConference'])
            # Division
            item['division'] = str(i['teamProfile']['division'])
            # Print crawl progress
            print("Sent", count, "records")
            count += 1
            # Hand the item back to the engine -> pipeline file
            yield item
Settings file -> enable the item pipeline
# Scrapy settings for nbaProject project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# ---------- leave unchanged ----------
BOT_NAME = 'nbaProject'
SPIDER_MODULES = ['nbaProject.spiders']
NEWSPIDER_MODULE = 'nbaProject.spiders'
# ---------- leave unchanged ----------
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'nbaProject (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ---------- modified part (look up what this setting does if you are curious) ----------
# ROBOTSTXT_OBEY = True
# ---------- modified part ----------
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'nbaProject.middlewares.NbaprojectSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'nbaProject.middlewares.NbaprojectDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# Enable the item pipeline
# ---------- modified part ----------
ITEM_PIPELINES = {
    'nbaProject.pipelines.NbaprojectPipeline': 300,
}
# ---------- modified part ----------
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
Pipeline file -> write the fields into MySQL
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymysql
class NbaprojectPipeline:
    # Constructor: set up the database connection and create the table once
    def __init__(self):
        # Connect to the database -- change host/user/password/db/port to match your own MySQL server
        self.connect = pymysql.connect(host='your-host', user='your-user', passwd='your-password',
                                       db='your-database', port=3306)
        # Get a cursor
        self.cursor = self.connect.cursor()
        # Create a table to hold the item fields
        createTableSql = """
            create table if not exists `nbaPlayer`(
                playerId INT UNSIGNED AUTO_INCREMENT,
                engName varchar(80),
                chName varchar(20),
                height varchar(20),
                weight varchar(20),
                contryEn varchar(50),
                contryCh varchar(20),
                experience int,
                jerseyNo int,
                draftYear int,
                engTeam varchar(50),
                chTeam varchar(50),
                position varchar(50),
                displayConference varchar(50),
                division varchar(50),
                primary key(playerId)
            )charset=utf8;
        """
        # Execute the SQL statement
        self.cursor.execute(createTableSql)
        self.connect.commit()
        print("Finished creating the table")

    # Every item yielded by the spider is handled here
    def process_item(self, item, spider):
        # Print the item so the crawl is easier to follow
        print(item)
        # SQL statement
        insert_sql = """
            insert into nbaPlayer(
                playerId, engName, chName, height,
                weight, contryEn, contryCh, experience,
                jerseyNo, draftYear, engTeam, chTeam,
                position, displayConference, division
            ) VALUES (null,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
        """
        # Execute the insert; the item fields fill the %s placeholders in the SQL statement
        self.cursor.execute(insert_sql, (item['engName'], item['chName'], item['height'], item['weight'],
                                         item['contryEn'], item['contryCh'], item['experience'], item['jerseyNo'],
                                         item['draftYear'], item['engTeam'], item['chTeam'], item['position'],
                                         item['displayConference'], item['division']))
        # Commit -- without a commit nothing is saved to the database
        self.connect.commit()
        print("Data committed successfully!")
        # Returning the item lets later pipelines (if any) receive it as well
        return item
Run the spider
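From the project root the spider is normally started with "scrapy crawl nbaSpider". As an alternative, a small launcher script (a sketch; the file name run.py is arbitrary) lets it be run directly from PyCharm:

# run.py -- start nbaSpider from code instead of the command line
from scrapy import cmdline

cmdline.execute('scrapy crawl nbaSpider'.split())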
The data scrolling by on the screen during the crawl
Then check the data in the database
And just like that, the player data has been scraped~
Source: https://blog.csdn.net/weixin_44864260/article/details/112986727

