Article / 文章中心

Python数据分析招式:pandas库过滤分组透视表-2

发布时间:2021-11-23 点击数:399

相关: Python数据分析招式:pandas库提取清洗排序-1

要点:

  • 数据的字符处理
  • 数据的过滤
  • 数据的分组
  • 数据的透视表

引入数据

# -*- coding: utf-8 -*-  # @File    : 数据集的处理.py # @Date    : 2018-06-03  import pandas as pd  file = "data/train.csv" df = pd.DataFrame(pd.read_csv(file)) print(df.head(3)) """  PassengerId  Survived  Pclass    ...        Fare Cabin  Embarked 0            1         0       3    ...      7.2500   NaN         S 1            2         1       1    ...     71.2833   C85         C 2            3         1       3    ...      7.9250   NaN         S  [3 rows x 12 columns] """

1、数据集的字符处理

# 对列的处理 mapping ={  'PassengerId': '乘客编号',  'Survived':'是否获救',  'Name':'姓名',  'Pclass':'船舱等级','Sex':'性别',  'Age':'年龄','SibSp':'兄弟姐妹数',  'Parch':'父母小孩数','Ticket':'船票',  'Fare':'船票费' }  ret = df.rename(columns=mapping) print(ret.head(3)) """  乘客编号  是否获救  船舱等级    ...         船票费 Cabin  Embarked 0     1     0     3    ...      7.2500   NaN         S 1     2     1     1    ...     71.2833   C85         C 2     3     1     3    ...      7.9250   NaN         S  [3 rows x 12 columns] """  # 对数据集里面的特定字符串进行替换 ret = df['Sex'].map({'female':'女','male':'男'}) print(ret.head(3)) """ [3 rows x 12 columns] 0    男 1    女 2    女 Name: Sex, dtype: object """  # 对列的字符进行替换, 只保留数字部分 # contains,split,match,findall,endswith df['Embarked']=df['Embarked'].replace(regex='[CS]', value='xxx') print(df.head(3)) """  PassengerId  Survived  Pclass    ...        Fare Cabin  Embarked 0            1         0       3    ...      7.2500   NaN       xxx 1            2         1       1    ...     71.2833   C85       xxx 2            3         1       3    ...      7.9250   NaN       xxx  [3 rows x 12 columns] """

2、数据集的过滤

# 用逻辑表达式组合过滤 ==, != ret = df[(df['Sex']=='female')&(df['Age']>10)] print(ret.head(3)) """  PassengerId  Survived  Pclass    ...        Fare Cabin  Embarked 1            2         1       1    ...     71.2833   C85       xxx 2            3         1       3    ...      7.9250   NaN       xxx 3            4         1       1    ...     53.1000  C123       xxx  [3 rows x 12 columns] """ # query函数 ret = df.query('Age==[10, 20]') print(ret[["Name", "Age"]].head(3)) """  Name   Age 12   Saundercock, Mr. William Henry  20.0 91       Andreasson, Mr. Paul Edvin  20.0 113         Jussila, Miss. Katriina  20.0 """

3、数据的分类

# 用where函数 import numpy as np ret=np.where(df['Age']>=18)  # apply函数 def convert_age(age):  if age> 0 and age < 10:  return "小孩"  elif age < 30:  return "大人"  else:  return "老人"  df["年龄分类"] = df['Age'].apply(convert_age) print(df[["Name", "Age", "年龄分类"]].sample(3)) """  Name   Age 年龄分类 624                   Bowen, Mr. David John "Dai"  21.0   大人 880  Shelley, Mrs. William (Imanita Parrish Hall)  25.0   大人 471                               Cacic, Mr. Luka  38.0   老人 """

4、 数据的切片和透视表

# groupby函数 print(df.groupby('Sex')[['Name', 'Sex']].count()) """  Name  Sex Sex female   314  314 male     577  577 """  # 对数据进行轴切片分析 ret = df.groupby(['Survived','Pclass'])['Age'].agg(['size','max','min','mean']) print(ret) """ Survived Pclass 0        1         80  71.0   2.00  43.695312  2         97  70.0  16.00  33.544444  3        372  74.0   1.00  26.555556 1        1        136  80.0   0.92  35.368197  2         87  62.0   0.67  25.901566  3        119  63.0   0.42  20.646118 """  # 数据透视表 ret = df.pivot_table(columns=['Sex'],index=['Survived','Pclass'],values='Age', aggfunc={'Age':[np.mean,min,max]}) print(ret) """  max             mean               min Sex             female  male     female       male female   male Survived Pclass 0        1        50.0  71.0  25.666667  44.581967   2.00  18.00  2        57.0  70.0  36.000000  33.369048  24.00  16.00  3        48.0  74.0  23.818182  27.255814   2.00   1.00 1        1        63.0  80.0  34.939024  36.248000  14.00   0.92  2        55.0  62.0  28.080882  16.022000   2.00   0.67  3        63.0  45.0  19.329787  22.274211   0.75   0.42 """