python

超轻量级php框架startmvc

Python实现爬虫爬取NBA数据功能示例

更新时间:2020-06-03 23:18:01 作者:startmvc
本文实例讲述了Python实现爬虫爬取NBA数据功能。分享给大家供大家参考,具体如下:爬取的网站为 stat-nba.com。

本文实例讲述了Python实现爬虫爬取NBA数据功能。分享给大家供大家参考,具体如下:

爬取的网站为:stat-nba.com,这里爬取的是 NBA 2016-2017 赛季常规赛截至 2017 年 1 月 7 日的数据。

改变url_header和url_tail即可爬取特定的其他数据。

源代码如下(基于 Python 2 编写,依赖 requests、bs4 和 pyExcelerator):


#coding=utf-8
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import requests
import time
import urllib
from bs4 import BeautifulSoup
import re
from pyExcelerator import *
def getURLLists(url_header,url_tail,pages):
    """
    Build the list of result-page URLs to crawl.

    Each URL is ``url_header + <page number> + url_tail``. Page numbers run
    from 0 through ``pages`` inclusive, so ``pages + 1`` URLs are returned;
    when ``pages`` is less than 1 only the page-0 URL is returned.

    :param url_header: URL text that precedes the page number.
    :param url_tail: URL text that follows the page number.
    :param pages: number of the last page to include.
    :return: list of URL strings in ascending page order.
    """
    last_page = max(pages, 0)
    url_lists = [url_header + str(page) + url_tail
                 for page in range(last_page + 1)]
    # Echo the first URL to stdout; the single-argument call form prints the
    # same text under both Python 2 and Python 3.
    print(url_lists[0])
    return url_lists
def getNBAAllData(url_lists):
    """
    Fetch every page in ``url_lists`` and merge the results into one list.

    Pages are fetched in the given order with ``getNBASingleData`` and their
    entries are concatenated; zero-length entries are then discarded.

    :param url_lists: iterable of page URLs to fetch.
    :return: flat list of the non-empty entries, in page order.
    """
    datasets = []
    for url in url_lists:
        datasets.extend(getNBASingleData(url))
    # Drop the empty entries in a single pass; calling list.remove() once per
    # empty entry would rescan the list from the start every time.
    return [item for item in datasets if len(item) != 0]
def getNBASingleData(url):
    """
    Download one page and return the text of its first ``<tbody>`` element.

    The element's text is split on newline characters, so the returned list
    may contain empty strings.

    :param url: address of the page to download.
    :return: list of strings taken from the table body, or an empty list
        when the page contains no ``<tbody>`` element.
    """
    response = urllib.urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the connection even when reading fails.
        response.close()
    # Name the parser explicitly so the parse tree does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(html, 'html.parser')
    tbody = soup.find('tbody')
    if tbody is None:
        # No result table on this page: report "no data" instead of failing
        # with AttributeError on None.
        return []
    return tbody.text.split('\n')
def saveDataToExcel(datasets,sheetname,filename):
    """
    Write the flat list of cells to a workbook, 24 cells per row.

    Row 0 receives the fixed column titles and the data starts at row 1.
    When ``len(datasets)`` is not a multiple of 24 the final, shorter row is
    written as far as it goes, so the workbook is still saved instead of the
    call failing with ``IndexError``.

    :param datasets: flat sequence of cell values in row-major order.
    :param sheetname: name given to the worksheet that is created.
    :param filename: path the workbook is saved to.
    """
    headers = (
        u'序号', u'球队', u'时间', u'结果', u'主客', u'比赛',
        u'投篮命中率', u'命中数', u'出手数',
        u'三分命中率', u'三分命中数', u'三分出手数',
        u'罚球命中率', u'罚球命中数', u'罚球出手数',
        u'篮板', u'前场篮板', u'后场篮板',
        u'助攻', u'抢断', u'盖帽', u'失误', u'犯规', u'得分',
    )
    book = Workbook()
    sheet = book.add_sheet(sheetname)
    for col, title in enumerate(headers):
        sheet.write(0, col, title)

    num = len(headers)  # 24 cells make up one data row
    data_len = len(datasets)
    print('data_len: %s' % data_len)
    # Walk the data in row-sized slices; slicing clamps at the end of the
    # list, so a short final row cannot index past the data.
    for row_cnt, start in enumerate(range(0, data_len, num), 1):
        print('序号: %s' % row_cnt)
        for col, value in enumerate(datasets[start:start + num]):
            sheet.write(row_cnt, col, value)
    book.save(filename)
def writeDataToTxt(datasets):
    """
    Dump the flat list of cells to ``nba_data.txt`` as tab-separated rows.

    Every 24 consecutive cells form one line, and each cell is followed by a
    tab. To keep the columns visually aligned, the second cell of a row (the
    team name) gets two tabs when it is shorter than five characters, and the
    value ``费城76人`` always gets two tabs. All cells are written, including
    the last one, and the file is encoded as UTF-8.

    :param datasets: flat sequence of text cells in row-major order.
    """
    import io  # local import so this function carries its own dependency

    columns = 24
    team_column = 1  # zero-based position of the team name within a row
    with io.open('nba_data.txt', 'w', encoding='utf-8') as fp:
        for index, cell in enumerate(datasets):
            column = index % columns
            short_team_name = column == team_column and len(cell) < 5
            if short_team_name or cell == u'费城76人':
                fp.write(cell + u'\t\t')
            else:
                fp.write(cell + u'\t')
            # End the line after the last cell of each row.
            if column == columns - 1:
                fp.write(u'\n')
if __name__ == "__main__":
    # NOTE(review): 1132 and 150 look like the total number of result rows
    # and the rows shown per page -- confirm against the site.
    pages = 1132 // 150
    url_header = 'http://stat-nba.com/query_team.php?page='
    url_tail = '&QueryType=game&order=1&crtcol=date_out&GameType=season&PageNum=3000&Season0=2016&Season1=2017#label_show_result'

    # Crawl every page, then export the same data set twice: as plain text
    # and as a spreadsheet stamped with today's date.
    url_lists = getURLLists(url_header, url_tail, pages)
    datasets = getNBAAllData(url_lists)
    writeDataToTxt(datasets)

    sheetname = 'nba normal data 2016-2017'
    str_time = time.strftime('%Y-%m-%d')
    filename = 'nba_normal_data' + str_time + '.xls'
    saveDataToExcel(datasets, sheetname, filename)

Python 爬虫 爬取 NBA数据