Date Tags python

1 简介

今天在 Kaggle 上看到了一个比较有意思的数据集(Dataset),收录了手游 Pokemon 中 721 只神奇宝贝的基本数据(Mega 形态等与原形态共用编号,因此表中共有 800 条记录),包括编号(#)、名字(Name)、类别(Type 1)、二级分类(Type 2)、基本属性(血量:HP,攻击力:Attack,防御力:Defense,魔攻:Sp. Atk,魔防:Sp. Def,速度:Speed)以及六项属性之和(Total)等。

现参考 Kaggle 上的一些文章,就 Pokemon 的类别对其基本属性的影响做一些分析探究。

2 Pokemon 基本数据概要

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

# Load the Pokemon stats table from the hosted CSV and preview its first rows.
data_df = pd.read_csv(filepath_or_buffer="http://airing.ursb.me/data/Pokemon.csv")
data_df.head()
Out[1]:
# Name Type 1 Type 2 Total HP Attack Defense Sp. Atk Sp. Def Speed Generation Legendary
0 1 Bulbasaur Grass Poison 318 45 49 49 65 65 45 1 False
1 2 Ivysaur Grass Poison 405 60 62 63 80 80 60 1 False
2 3 Venusaur Grass Poison 525 80 82 83 100 100 80 1 False
3 3 VenusaurMega Venusaur Grass Poison 625 80 100 123 122 120 80 1 False
4 4 Charmander Fire NaN 309 39 52 43 60 50 65 1 False
In [2]:
# The last two columns (Generation, Legendary) are not used in this analysis,
# so drop them.  The columns are named through the `columns=` keyword: the
# positional-axis form `drop(labels, 1)` is rejected by pandas 2.0 and later.
data_df = data_df.drop(columns=['Generation', 'Legendary'])
data_df.describe()
Out[2]:
# Total HP Attack Defense Sp. Atk Sp. Def Speed
count 800.000000 800.00000 800.000000 800.000000 800.000000 800.000000 800.000000 800.000000
mean 362.813750 435.10250 69.258750 79.001250 73.842500 72.820000 71.902500 68.277500
std 208.343798 119.96304 25.534669 32.457366 31.183501 32.722294 27.828916 29.060474
min 1.000000 180.00000 1.000000 5.000000 5.000000 10.000000 20.000000 5.000000
25% 184.750000 330.00000 50.000000 55.000000 50.000000 49.750000 50.000000 45.000000
50% 364.500000 450.00000 65.000000 75.000000 70.000000 65.000000 70.000000 65.000000
75% 539.250000 515.00000 80.000000 100.000000 90.000000 95.000000 90.000000 90.000000
max 721.000000 780.00000 255.000000 190.000000 230.000000 194.000000 230.000000 180.000000
In [3]:
# Start with the relationship between HP and Attack.
sns.jointplot(data=data_df, x="HP", y="Attack")
plt.show()
In [4]:
# Box plot of the Total column over all rows of the table.
sns.boxplot(data=data_df, y="Total")
plt.show()
In [5]:
# The id ("#") and Total columns carry no information for the per-stat study,
# so build a second frame without them and box-plot what remains.
# `columns=` is used because the positional-axis form `drop(labels, 1)` is
# rejected by pandas 2.0 and later.
data_df_2 = data_df.drop(columns=['#', 'Total'])
sns.boxplot(data=data_df_2)
plt.show()
In [6]:
# Collect the names of the int64 columns of data_df_2.  The boolean mask is
# built from data_df_2 itself; masking data_df_2.dtypes with a mask computed
# on data_df (which has extra columns) only worked through implicit index
# alignment.
var_int = data_df_2.dtypes[data_df_2.dtypes == 'int64'].index
# Skip the first integer column.
# NOTE(review): per the recorded output this leaves HP out of var_int, so the
# later per-stat plots driven by var_int do not include HP -- confirm that
# this is intended.
var_int = var_int[1:]
var_int
Out[6]:
Index(['Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed'], dtype='object')
In [7]:
# Histogram (50 bins) of every column listed in var_int, laid out on a 3x3 grid.
l_int = len(var_int)
fig = plt.figure(figsize=(13, 8))
for position, column in enumerate(var_int, start=1):
    axis = fig.add_subplot(3, 3, position)
    axis.hist(data_df_2[column], bins=50)
    axis.set_title(column)

plt.show()
In [8]:
# Pairwise correlation between the stat columns.
# data_df_2 still contains the text columns Name / Type 1 / Type 2; they are
# filtered out explicitly because DataFrame.corr() in pandas 2.0+ no longer
# skips non-numeric columns and raises when it meets them.
data_df_2.select_dtypes(include='number').corr()
Out[8]:
HP Attack Defense Sp. Atk Sp. Def Speed
HP 1.000000 0.422386 0.239622 0.362380 0.378718 0.175952
Attack 0.422386 1.000000 0.438687 0.396362 0.263990 0.381240
Defense 0.239622 0.438687 1.000000 0.223549 0.510747 0.015227
Sp. Atk 0.362380 0.396362 0.223549 1.000000 0.506121 0.473018
Sp. Def 0.378718 0.263990 0.510747 0.506121 1.000000 0.259133
Speed 0.175952 0.381240 0.015227 0.473018 0.259133 1.000000

3 探索 Pokemon 类别对其属性的影响

In [9]:
# List the distinct primary types, then count the rows of each primary type
# and show the counts from most to least common.
type1 = data_df['Type 1'].unique()
print(type1)

data_type1 = data_df.groupby('Type 1')['#'].count()
data_type1.sort_values(ascending=False)
['Grass' 'Fire' 'Water' 'Bug' 'Normal' 'Poison' 'Electric' 'Ground' 'Fairy'
 'Fighting' 'Psychic' 'Rock' 'Ghost' 'Ice' 'Dragon' 'Dark' 'Steel' 'Flying']
Out[9]:
Type 1
Water       112
Normal       98
Grass        70
Bug          69
Psychic      57
Fire         52
Rock         44
Electric     44
Ground       32
Dragon       32
Ghost        32
Dark         31
Poison       28
Steel        27
Fighting     27
Ice          24
Fairy        17
Flying        4
Name: #, dtype: int64
In [10]:
# Pie chart of the share of each primary type.  The eight most common types
# get a wedge of their own; every remaining type is pooled into "Other".
# Wedge sizes are counted from data_df instead of being typed in by hand: the
# previously hard-coded "Other" size (175) did not match the 254 rows that
# remain once the eight named types (546 rows) are taken out of the 800 rows.
top_types = ['Water', 'Normal', 'Grass', 'Bug', 'Psychic', 'Fire', 'Electric', 'Rock']
type_counts = data_df['Type 1'].value_counts()
sizes = [int(type_counts[type_name]) for type_name in top_types]
sizes.append(int(type_counts.sum()) - sum(sizes))  # everything not listed above
labels = top_types + ['Other']
colors = ['yellowgreen', 'gold', 'lightskyblue', 'lightcoral', 'yellow', 'lightgreen', 'silver', 'white', 'pink']
explode = (0, 0, 0, 0, 0, 0, 0, 0, 0.1)  # pull only the "Other" wedge outwards

plt.pie(sizes, explode=explode, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=90)
plt.axis('equal')
plt.title("Percentage of Different Types of Pokemon")
plt.show()
In [11]:
# Box plot of Total for each primary type.  Every type name is first mapped
# to an integer code (stored in the Int_Type1 column), and that code is what
# the x axis shows.
type_names = [
    'Grass', 'Fire', 'Water', 'Bug', 'Normal', 'Poison',
    'Electric', 'Ground', 'Fairy', 'Fighting', 'Psychic', 'Rock',
    'Ghost', 'Ice', 'Dragon', 'Dark', 'Steel', 'Flying',
]
type_to_int_dict = {name: code for code, name in enumerate(type_names)}

type_codes = data_df['Type 1'].map(type_to_int_dict)
data_df['Int_Type1'] = type_codes.astype(int)

sns.set(style="ticks")
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=data_df, x="Int_Type1", y="Total", palette="PRGn", ax=ax)
sns.despine(offset=10, trim=True)
plt.show()

# Observation: the Dragon type's Total is, on average, well above the others.
In [12]:
# Reshape to long form: Name / Type 1 / Type 2 stay as identifier columns,
# the remaining column names go into "Stat" and their numbers into "value".
data_type1 = data_df_2.melt(id_vars=["Name", "Type 1", "Type 2"], var_name="Stat")
data_type1.head()
Out[12]:
Name Type 1 Type 2 Stat value
0 Bulbasaur Grass Poison HP 45
1 Ivysaur Grass Poison HP 60
2 Venusaur Grass Poison HP 80
3 VenusaurMega Venusaur Grass Poison HP 80
4 Charmander Fire NaN HP 39
In [13]:
# Swarm plot of every stat value from the long-form table, coloured by
# primary type.
plt.figure(figsize=(12, 10))
plt.ylim(0, 275)
# `dodge=True` separates the hue levels along the categorical axis.  It is
# the current spelling of the option this call used to pass as `split=True`,
# a keyword that newer seaborn releases no longer accept for swarmplot.
sns.swarmplot(x="Stat", y="value", data=data_type1, hue="Type 1", dodge=True, size=7)
# Anchor the legend just outside the upper-right corner of the axes.
plt.legend(bbox_to_anchor=(1, 1), loc=2, borderaxespad=0.)
plt.show()
In [14]:
# A more direct view: one box plot per stat listed in var_int, grouped by
# primary type and stacked vertically in a six-row layout.
fig = plt.figure(figsize=(13, 24))
for row, stat in enumerate(var_int[:6], start=1):
    ax1 = fig.add_subplot(6, 1, row)
    sns.boxplot(data=data_df_2, x='Type 1', y=stat, ax=ax1)

plt.show()
In [15]:
# Observation from the box plots: Dragon types have the highest Attack,
# Steel types the strongest Defense and Flying types the highest Speed.
# A box plot shows where the quantiles sit, whereas a violin plot shows the
# density at every position, so the same data is drawn again as violin plots.

# Distribution of HP across the primary types: one row per Name, one column
# per Type 1, HP as the cell value.
hp_data = (
    data_df[['Name', 'Type 1', 'HP']]
    .pivot_table(values='HP', index=['Name'], columns=['Type 1'])
)
f, ax = plt.subplots(figsize=(18, 6))
sns.violinplot(data=hp_data, palette="Set3", bw=.2, cut=1, linewidth=1, ax=ax)
# NOTE(review): the describe() output lists an HP maximum of 255, so this
# upper limit crops the top of the distribution -- confirm that is intended.
ax.set_ylim(0, 200)
ax.set_title("HP of Different Types of Pokemon")
sns.despine(left=True, bottom=True)

plt.show()
In [17]:
# Distribution of Attack across the primary types: one row per Name, one
# column per Type 1, Attack as the cell value.
attack_data = (
    data_df[['Name', 'Type 1', 'Attack']]
    .pivot_table(values='Attack', index=['Name'], columns=['Type 1'])
)
f, ax = plt.subplots(figsize=(18, 6))
sns.violinplot(data=attack_data, palette="Set3", bw=.2, cut=1, linewidth=1, ax=ax)
ax.set_ylim(0, 200)
ax.set_title("Attack of Different Types of Pokemon")
sns.despine(left=True, bottom=True)

plt.show()
In [18]:
# Distribution of Defense across the primary types: one row per Name, one
# column per Type 1, Defense as the cell value.
defense_data = (
    data_df[['Name', 'Type 1', 'Defense']]
    .pivot_table(values='Defense', index=['Name'], columns=['Type 1'])
)
f, ax = plt.subplots(figsize=(18, 6))
sns.violinplot(data=defense_data, palette="Set3", bw=.2, cut=1, linewidth=1, ax=ax)
# NOTE(review): the describe() output lists a Defense maximum of 230, so this
# upper limit crops the top of the distribution -- confirm that is intended.
ax.set_ylim(0, 200)
ax.set_title("Defense of Different Types of Pokemon")
sns.despine(left=True, bottom=True)

plt.show()
In [22]:
# Distribution of Sp. Atk across the primary types: one row per Name, one
# column per Type 1, Sp. Atk as the cell value.
sp_attack_data = (
    data_df[['Name', 'Type 1', 'Sp. Atk']]
    .pivot_table(values='Sp. Atk', index=['Name'], columns=['Type 1'])
)
f, ax = plt.subplots(figsize=(18, 6))
sns.violinplot(data=sp_attack_data, palette="Set3", bw=.2, cut=1, linewidth=1, ax=ax)
ax.set_ylim(0, 200)
ax.set_title("Sp.Attack of Different Types of Pokemon")
sns.despine(left=True, bottom=True)

plt.show()
In [23]:
# Distribution of Sp. Def across the primary types: one row per Name, one
# column per Type 1, Sp. Def as the cell value.
sp_defense_data = (
    data_df[['Name', 'Type 1', 'Sp. Def']]
    .pivot_table(values='Sp. Def', index=['Name'], columns=['Type 1'])
)
f, ax = plt.subplots(figsize=(18, 6))
sns.violinplot(data=sp_defense_data, palette="Set3", bw=.2, cut=1, linewidth=1, ax=ax)
# NOTE(review): the describe() output lists a Sp. Def maximum of 230, so this
# upper limit crops the top of the distribution -- confirm that is intended.
ax.set_ylim(0, 200)
ax.set_title("Sp.Defense of Different Types of Pokemon")
sns.despine(left=True, bottom=True)

plt.show()
In [24]:
# Distribution of Speed across the primary types: one row per Name, one
# column per Type 1, Speed as the cell value.
speed_data = (
    data_df[['Name', 'Type 1', 'Speed']]
    .pivot_table(values='Speed', index=['Name'], columns=['Type 1'])
)
f, ax = plt.subplots(figsize=(18, 6))
sns.violinplot(data=speed_data, palette="Set3", bw=.2, cut=1, linewidth=1, ax=ax)
ax.set_ylim(0, 200)
ax.set_title("Speed of Different Types of Pokemon")
sns.despine(left=True, bottom=True)

plt.show()