基本上我有一个包含字符串和整数的csv文件(下面的屏幕截图),我想将“country_id”部分分组为各大洲,例如,“EUROPE”将包含整行“Hubert Hurkacz”和其他欧洲国家等等,如果这有意义的话。
对它们进行分组后,我们想要使用这些数据并找出分组大陆的“ace_%”和“matches_won_%”的平均值,并用它们绘制散点图。但我们不知道该怎么做。
我正在 Jupyter 中使用 Python 3 编写代码。
from cs103 import *
from typing import NamedTuple, List
import csv
import matplotlib.pyplot as plt
##################
Data Definitions
# The file we are using is called TENNIS_ACE_VS_WIN.csv and it includes rank, name, country name, country ID, ace
# percentages, and match won percentages. We chose to use name, country ID, ace percentages and matches won
# percentages to represent our information. They are crucial to our project because they are all the information
# we need in order to answer our research question.
# Compound data describing one tennis player. The inline range comments and
# the interp. comment sit on their own lines so that they cannot swallow the
# field list or the example constants that follow them.
TennisPlayer = NamedTuple('TennisPlayer', [
    ('name', str),
    ('country_id', str),
    # represented as a percentage (e.g., 20.5 for 20.5%), in range [0, 100]
    ('ace_percentage', float),
    # represented as a percentage (e.g., 20.5 for 20.5%), in range [0, 100]
    ('matches_won_percentage', float),
])
# interp. a tennis player including their name, country ID, aces percentage,
# and matches won percentage
HURKACZ = TennisPlayer("Hubert Hurkacz", "POL", 24.94, 59.44)
FRITZ = TennisPlayer("Taylor Fritz", "USA", 18.47, 58.26)
@typecheck
def fn_for_tennis_player(tp: TennisPlayer) -> ...:
    # template based on compound: every field of tp is made available to
    # the (still unfilled) body
    return ...(
        tp.name,
        tp.country_id,
        tp.ace_percentage,
        tp.matches_won_percentage,
    )
# List[TennisPlayer]
# interp. a list of TennisPlayer representing the statistics of professional
# tennis players
LOTP0 = []
LOTP1 = [HURKACZ, FRITZ]


@typecheck
def fn_for_lotp(lotp: List[TennisPlayer]) -> ...:
    # template for an arbitrary-sized list of TennisPlayer, using an
    # accumulator
    # description of the acc
    acc = ...  # type: ...
    for tp in lotp:
        acc = ...

    return ...
# Placeholder data definition: Consumed has not been filled in yet.
Consumed = ...

# List[Consumed]
# interp. a list of Consumed
LOC0 = []


@typecheck
def fn_for_loc(loc: List[Consumed]) -> ...:
    ...  # choose which template body to use for List[Consumed]
2nd part###########
Functions
@typecheck
def main(filename: str) -> None:
    """
    Read the player data stored in the file called filename and hand it to
    show_scatterplot, returning whatever show_scatterplot returns.
    """
    # Template from HtDAP, based on function composition
    players = read(filename)
    return show_scatterplot(players)
@typecheck
def read(filename: str) -> List[TennisPlayer]:
    """
    Read the CSV file called filename (laid out like
    'TENNIS_ACE_VS_WIN.csv') and return its data rows as a list of
    TennisPlayer, in file order.

    The first line is treated as a header and skipped. A file with no
    lines at all produces an empty list.
    """
    # return []  # stub
    # Template from HtDAP
    # lotp contains the result so far
    lotp = []  # type: List[TennisPlayer]

    # newline='' leaves newline handling to the csv module, as the csv
    # documentation recommends for file objects given to csv.reader
    with open(filename, newline='') as csvfile:
        reader = csv.reader(csvfile)
        # skip header line; the None default keeps a completely empty file
        # from raising StopIteration
        next(reader, None)
        for row in reader:
            # only four of the columns are kept, and the two percentages
            # are converted from strings to floats
            tp = TennisPlayer(row[1],               # name, just a str
                              row[3],               # country ID, just a str
                              parse_float(row[4]),  # ace %, a float in range [0, 100]
                              parse_float(row[5]))  # matches won %, a float in range [0, 100]
            lotp.append(tp)

    return lotp
@typecheck
def analyze(lotp: List[TennisPlayer]) -> Produced:
    """
    ...
    """
    # NOTE(review): this is still the unfilled template body; Produced is not
    # defined in the visible part of the file -- confirm before relying on it.
    # The statement that used to follow this return could never run (and
    # referred to undefined x_vals / y_vals), so it has been removed.
    return ...
# The 32 country IDs that countries() keeps; a player with any other ID is
# left out of the result.
_KNOWN_COUNTRY_IDS = frozenset([
    "POL", "USA", "RUS", "GER", "KAZ", "NED", "GRE", "AUS",
    "BUL", "CHN", "FRA", "ITA", "SUI", "CAN", "ARG", "SRB",
    "DEN", "NOR", "ESP", "CZE", "GBR", "FIN", "AUT", "RSA",
    "JPN", "CRO", "COL", "BRA", "CHI", "PER", "HUN", "BLR",
])


@typecheck
def countries(lotp: List[TennisPlayer]) -> List[TennisPlayer]:
    """
    Takes in a list of TennisPlayer and returns a list of the TennisPlayer
    whose country_id is one of the known country IDs, in their original
    order.
    """
    # return []  # stub
    # the TennisPlayer with a known country ID seen so far
    acc = []  # type: List[TennisPlayer]
    for tp in lotp:
        # a single membership test replaces the chain of identical branches;
        # the player itself is appended, not the whole input list
        if tp.country_id in _KNOWN_COUNTRY_IDS:
            acc.append(tp)

    return acc
**We are not sure how to do this part**
def show_scatterplot(lotp: List[TennisPlayer]) -> None:
    """
    Display a scatterplot with one point per TennisPlayer in lotp: ace
    percentage on the x-axis and matches won percentage on the y-axis.

    Points are colour coded by country: red for "USA", green for "AUS" and
    blue for every other country. Always returns None.
    """
    # return None  # stub

    x_vals = get_x_vals(lotp)
    y_vals = get_y_vals(lotp)

    # one colour per player, in the same order as the players in lotp, so
    # that each point gets its own colour (calling plt.setp on the scatter
    # inside a loop recolours every point at once)
    colours = []  # type: List[str]
    for tp in lotp:
        if tp.country_id == "USA":
            colours.append('#FF0000')
        elif tp.country_id == "AUS":
            colours.append('#008000')
        else:
            colours.append('#0000FF')

    # set the labels for the axes
    plt.xlabel('Ace percentage %')
    plt.ylabel('Matches won percentage %')
    plt.title('Relationship betweeen Matches Won % and Ace %')

    # create the scatterplot, colouring each point individually
    plt.scatter(x_vals, y_vals, s=1, c=colours)

    # show the plot
    plt.show()

    return None
@typecheck
def get_x_vals(lotp: List[TennisPlayer]) -> List[float]:
    """
    Produce the ace percentage of every player in lotp, keeping the
    players' original order.
    """
    # return []  # stub
    return [player.ace_percentage for player in lotp]
@typecheck
def get_y_vals(lotp: List[TennisPlayer]) -> List[float]:
    """
    Produce the matches won percentage of every player in lotp, keeping
    the players' original order.
    """
    # return []  # stub
    return [player.matches_won_percentage for player in lotp]
start_testing()

# Examples and tests for main (placeholder, not filled in yet)
expect(..., ...)

summary()
start_testing()

# Examples and tests for read
expect(read("TENNIS_ACE_VS_WIN_empty.csv"), [])
expect(
    read("TENNIS_ACE_VS_WIN_test1.csv"),
    [
        TennisPlayer("Alexei Popyrin", "AUS", 13.39, 53.95),
        TennisPlayer("Maxime Cressy", "USA", 11.23, 52.43),
        TennisPlayer("Adrian Mannarino", "FRA", 10.42, 51.82),
        TennisPlayer("Marco Cecchinato", "ITA", 5.58, 48.05),
    ],
)
expect(
    read("TENNIS_ACE_VS_WIN_test2.csv"),
    [
        TennisPlayer("Daniil Medvedev", "RUS", 14.72, 55.01),
        TennisPlayer("John Isner", "USA", 11.81, 53.18),
        TennisPlayer("Adrian Mannarino", "FRA", 10.42, 51.82),
        TennisPlayer("Denis Shapovalov", "CAN", 6.84, 49.11),
    ],
)

# show testing summary
summary()
start_testing()

# Examples and tests for analyze (placeholder, not filled in yet)
expect(..., ...)

summary()
start_testing()

# Examples and tests for countries
# The expect call was missing its expected value and closing parenthesis.
# Both players in LOTP1 have country IDs ("POL", "USA") that countries
# checks for, so both are expected back in their original order.
expect(countries(LOTP0), [])
expect(countries(LOTP1), [HURKACZ, FRITZ])

summary()
start_testing()

# Examples and tests for show_scatterplot
expect(show_scatterplot(LOTP1), None)

summary()
start_testing()

# Examples and tests for get_x_vals
expect(
    get_x_vals([
        TennisPlayer("Alexei Popyrin", "AUS", 13.39, 53.95),
        TennisPlayer("Maxime Cressy", "USA", 11.23, 52.43),
    ]),
    [13.39, 11.23],
)
expect(
    get_x_vals([
        TennisPlayer("Josh Isner", "USA", 11.81, 53.18),
        TennisPlayer("Maxime Cressy", "USA", 11.23, 52.43),
        TennisPlayer("Denis Shapovalov", "CAN", 6.84, 49.11),
    ]),
    [11.81, 11.23, 6.84],
)
expect(
    get_x_vals([
        TennisPlayer("Marco Cecchinato", "ITA", 5.58, 48.05),
        TennisPlayer("Adrian Mannarino", "FRA", 10.42, 51.82),
    ]),
    [5.58, 10.42],
)

summary()
start_testing()

# Examples and tests for get_y_vals
expect(
    get_y_vals([
        TennisPlayer("Alexei Popyrin", "AUS", 13.39, 53.95),
        TennisPlayer("Maxime Cressy", "USA", 11.23, 52.43),
    ]),
    [53.95, 52.43],
)
expect(
    get_y_vals([
        TennisPlayer("Josh Isner", "USA", 11.81, 53.18),
        TennisPlayer("Maxime Cressy", "USA", 11.23, 52.43),
        TennisPlayer("Denis Shapovalov", "CAN", 6.84, 49.11),
    ]),
    [53.18, 52.43, 49.11],
)
expect(
    get_y_vals([
        TennisPlayer("Marco Cecchinato", "ITA", 5.58, 48.05),
        TennisPlayer("Adrian Mannarino", "FRA", 10.42, 51.82),
    ]),
    [48.05, 51.82],
)

summary()
# Run the whole program on the full data file: main reads it and passes the
# resulting players to show_scatterplot.
main("TENNIS_ACE_VS_WIN.csv")
我会考虑使用 pandas。借助 DataFrame 对象(类似于 Excel 表格),再配合一个将各个国家/地区映射到所属大洲的查找表,这会比较容易实现。