NBA 1985-2017 数据分析一
数据收集:
1.爬取网站NBA赛程数据1985赛季至今所有比赛结果;
2.写入mysql
数据概览:
场均得分
主场优势,胜率,主客场分差
初始数据:
主队胜负列:主队胜1,主队负0
赛季 日期 主队 主队得分 客队 客队得分 主队胜负 主队分差 得分总和
0 1985-1986 1985-10-25 老鹰 91 子弹 100 0 -9 191
1 1985-1986 1985-10-25 公牛 116 骑士 115 1 1 231
2 1985-1986 1985-10-25 活塞 118 雄鹿 116 1 2 234
3 1985-1986 1985-10-25 勇士 105 掘金 119 0 -14 224
4 1985-1986 1985-10-25 篮网 113 凯尔特 109 1 4 222
各赛季比赛数:
98-99,11-12赛季停摆,
04年以来,每赛季总比赛在1315场左右,
其中常规赛1230场
17-18赛季至12月27日共497场
df.iloc[:,0:2].groupby('赛季').count().plot(stacked=False)
场均得分变化:
最高85-86赛季220.5分
最低98-99赛季182.5分
呈V字形,从2000年左右最低,
近年场均得分有逐渐提高的趋势,11-12赛季相较10-11有6分的下滑
sav=df.groupby('赛季').mean().loc[:,"得分总和"]
sav.plot(stacked=False,figsize=(10,3))
主队胜率:
主队胜率最高在87-88赛季,68%
最低在14-15赛季,57.6%
总体逐年降低
hw=df.groupby('赛季').mean().loc[:,"主队胜负"]
hw.plot(stacked='False',figsize=(10,3))
主客场分差:
主客场分差最高出现在85-90期间,5-6分
最低在14-15赛季 2.4分
总体呈下滑趋势,与主队胜率分析大体一致,主场优势(此数据包括季后赛)不明显
17-18至今样本数据,分差1.96分
hdiff=df.groupby('赛季').mean().loc[:,"主队分差"]
hdiff.plot(stacked=False,figsize=(10,3))
代码如下:
数据库:
建库NBA,建表gameresult,column:g_id,season,date,host,hostscore,guest,guestscore
导入模块:
import requests
from bs4 import BeautifulSoup
import re
from mysql import connector
数据库类:
class MySql():
def __init__(self, host, user, password, port, db):
self.host = host
self.user = user
self.password = password
self.port = port
self.db = db
self.conn = connector.connect(host=self.host, user=self.user, passwd=self.password, port=self.port, db=self.db)
self.cursor = self.conn.cursor()
#查询
def queryDB(self, table_name, param):
sql = 'select * from ' + table_name + " where season=%s and date=%s and guest=%s and guestscore=%s and hostscore=%s and host=%s;"
self.cursor.execute(sql,param)
def fetchRow(self):
result = self.cursor.fetchone()
return result
#插入,插入前判断是否存在该条记录
def insertDB(self, table_name, param):
self.queryDB(table_name, param)
count = self.fetchRow()
if count == None:
sql = "insert into " + table_name + "(season, date, guest,guestscore,hostscore,host) values(%s, %s, %s, %s, %s, %s);"
self.cursor.execute(sql,param)
def commitDB(self):
self.conn.commit()
def closeDB(self):
self.cursor.close()
self.conn.close()
爬取程序:
def search():
a1 = '1985'
a2 = '1986'
yl = []
while int(a1) <= 2017:
y= ('%s-%s'%(a1,a2))
yl.append(y)
a1,a2 = a2,str(int(a2)+1)
for season in yl:
fy = season[:4]
sy = season[-4:]
ml = (fy+'-10',fy+'-11',fy+'-12',sy+'-01',sy+'-02',sy+'-03',sy+'-04',sy+'-05',sy+'-06')
for m in ml:
URL = ('http://stat-nba.com/gameList_simple-%s.html'%m)
if re.match(r'/d{4}/-/d{4}$',season) and int(season[-4:])-int(season[:4])==1:
req = requests.get(URL)
soup = BeautifulSoup(req.content.decode('utf-8'),'html.parser')
for ml in soup.find_all('div',{'class':"cheight"}):
for gd in ml.find_all('font',{'class':'cheightdate'}):
date = str(gd.string)
for gr in ml.find_all('a'):
gameresult = gr.string.replace(' 76人','SIXER')
guest = re.match(r'^(/w*?)(/d*)/-(/d*)(/w*)$',gameresult)[1]
host = re.match(r'^(/w*?)(/d*)/-(/d*)(/w*)$',gameresult)[4]
guestscore = re.match(r'^(/w*?)(/d*)/-(/d*)(/w*)$',gameresult)[2] or '0'
hostscore = re.match(r'^(/w*?)(/d*)/-(/d*)(/w*)$',gameresult)[3] or '0'
mysql = MySql('127.0.0.1','root','','3306','nba')
param = (season, date, guest,guestscore,hostscore,host)
mysql.insertDB("gameresult",param)
mysql.commitDB()
mysql.closeDB()
print(date,gameresult)
if __name__ == '__main__':
search()
pandas读取数据:
%matplotlib inline
from mysql import connector
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
添加列:
def add_result(df):
hsl=[x for x in df.主队得分]
gsl=[y for y in df.客队得分]
rl=[]
for x,y in zip(hsl,gsl):
if x>y:
rl.append(1)
else:
rl.append(0)
df['主队胜负']=rl
# add_result(r2017)
# r2017.head()
def add_diff(df):
hsl=[x for x in df.主队得分]
gsl=[y for y in df.客队得分]
rl=[]
for x,y in zip(hsl,gsl):
rl.append(x-y)
df['主队分差']=rl
def add_sum(df):
hsl=[x for x in df.主队得分]
gsl=[y for y in df.客队得分]
rl=[]
for x,y in zip(hsl,gsl):
rl.append(x+y)
df['得分总和']=rl
导出数据:
conn = connector.connect(user = 'root',password='',database = 'nba')
cursor = conn.cursor()
cursor.execute("select season,date,host,hostscore,guest,guestscore from gameresult where guestscore>0 order by date")
value = cursor.fetchall()
# print(value)
df = pd.DataFrame(value,columns=('赛季','日期','主队','主队得分','客队','客队得分'))
add_result(df)
add_diff(df)
add_sum(df)
大数据培训、人工智能培训、Python培训、大数据培训机构、大数据培训班、数据分析培训、大数据可视化培训,就选光环大数据!光环大数据,聘请专业的大数据领域知名讲师,确保教学的整体质量与教学水准。讲师团及时掌握时代潮流技术,将前沿技能融入教学中,确保学生所学知识顺应时代所需。通过深入浅出、通俗易懂的教学方式,指导学生更快的掌握技能知识,成就上万个高薪就业学子。 更多问题咨询,欢迎点击------>>>>在线客服!