数据分析及可视化

为无为,事无事,味无味。大小多少。抱怨以德。 图难于其易,为大于其细;天下难事,必作于易;天下大事,必作于细。 ——老子

1. Numpy

numpy库是python数据分析的基础,结合了python的易用性和c语言的速度。先来了解一下numpy的数据结构。pandas库是在numpy库基础上开发出来的,两者的数据结构有一定共通性。

# array函数创建数组
vector = np.array([10, 20, 30])
matrix = np.array([[5, 10, 15], [20, 25, 30], [35, 40, 45]])
vector_shape = vector.shape  #(3, )
matrix_shape = matrix.shape  #(3, 3)
vector = np.zeros(10); vector = np.empty(10); 
matrix = np.zeros((3, 6)); matrix = np.empty((3, 6)); 

world_alcohol = np.genfromtxt("world_alcohol.csv", delimiter=",")  #genfromtxt
print(type(world_alcohol))  #ndarray

world_alcohol_dtype = world_alcohol.dtype #array值的数据类型,一个array仅有一个。常用的有bool(True, False), int(int16, int32, int64), float(float16, float32, float64), string(string, unicode)

world_alcohol = np.genfromtxt("world_alcohol.csv", delimiter=",", dtype="U75", skip_header=1)  #参数定义

# numpy切片操作与list类似
# 数组切片是视图不是副本,可用.copy()得到副本。布尔型索引是副本
years_1984 = (world_alcohol[:,0] == "1984")  #得到一维逻辑值向量
world_alcohol[year_1984, :]  #得到一维逻辑向量为真的子矩阵

world_alcohol[:,0][world_alcohol[:,0] == '1986'] = '2014'  #逻辑筛选后改值
is_value_empty = world_alcohol[:,4] == ''  #逻辑向量
world_alcohol[is_value_empty, 4] = '0'  #空值改0

arr = arr.astype(float)  #astype得到副本
arr1 = arr1.astype(arr2.dtype)  #利用dtype属性
vector.sum()     matrix.sum(1)
vector.mean()     matrix.mean(axis=1)
vector.max()     matrix.max(axis=1)

arr = np.random.randn(6, 3)
np.dot(arr.T, arr)  #转置并计算矩阵内积

# 基本数组统计方法
# sum, mean, std, var, min, max, argmin(最小元素索引), argmax, cumsum, cumprod...

vector.sort(); matrix.sort(1)  #sort()就地排序,改变原有数组;多维数组排序需传入轴号 
np.sort(arr)  ######顶级方法返回副本
np.unique(arr)  #副本
np.loadtxt();  np.genfromtxt(); np.savetxt()

2. Pandas

Pandas库是python中用来处理表格型数据最流行最重要最有效的工具。pandas主要数据结构包括series(一维)和dataframe(二维)以及panel(可看成三维,不常用)。相比numpy,pandas最大的优势是可以存储多种数据类型,以及处理缺失值NaN。

import pandas
food_info = pandas.read_csv("food_info.csv")
print(type(food_info))  #<class 'pandas.core.frame.DataFrame'>
print(food_info.head(3))
dimensions = food_info.shape #得到shape元组,shape[0]可得行数
print(food_info.dtypes)  #返回series,索引是列名,值是列的数据类型

# 行和列的选取
### loc[]采用行标签选取,可以是数字、字符或字符串。当行标签是0到n的整数时,与iloc()相同。
hundredth_row = food_info.loc[99]  #loc[]选行
food_info.loc[3:6]  #选多行,左闭右闭#######
food_info.loc[[2,5,10]]  #选多行
ndb_col = food_info["NDB_No"]  #选列
zinc_copper = food_info[["Zinc_(mg)", "Copper_(mg)"]]  #选多列

col_names = food_info.columns.tolist()  #.columns返回index对象,tolist()返回列表
food_info.sort_values("Sodium_(mg)", inplace=True, ascending=False) #inplace定义在原df上修改,默认增序排列

sex = titanic_survival["sex"]  #得到series对象
sex_is_null = pandas.isnull(sex)  #得到逻辑值series对象
sex_null_true = sex[sex_is_null]  #得到逻辑值为真的series对象

passenger_class_fares = titanic_survival.pivot_table(index="pclass", values="fare", aggfunc=np.mean)  #得到表, 聚合函数aggfunc默认为mean

drop_na_rows = titanic_survival.dropna(axis=0)  #丢弃有缺失值的行
drop_na_columns = titanic_survival.dropna(axis=1) #丢弃有缺失值的列
new_titanic_survival = titanic_survival.dropna(axis=0, subset=["age", "sex"]) #丢弃subset列有缺失值的行

### iloc[]采用整数位置索引,即编号索引
first_five_rows = new_titanic_survival.iloc[0:5]
row_index_25 = new_titanic_survival.loc[25]
row_position_fifth = new_titanic_survival.iloc[4]
# iloc[] 和 loc[]均可同时选行与列
all_rows_first_three_columns = new_titanic_survival.iloc[:,0:3]
row_index_83_age = new_titanic_survival.loc[83,"age"]

titanic_reindexed = new_titanic_survival.reset_index(drop=True) #重设索引,常用于排序后重新定义索引,默认添加新列设置索引,drop=True丢弃原索引

# DataFrame.apply(func) 将函数应用在每一列上, DataFrame.apply(func, axis=1)应用于每一行 
def null_count(column):
    column_null = pd.isnull(column)
    null = column[column_null]
    return len(null)
column_null_count = titanic_survival.apply(null_count)

s1 = series.sort_index()
s2 = series.sort_values()
np.add(s1, s2); np.sin(s1); np.max(s1)

df = df.set_index('col1', drop=False)  #将col1列作为索引,col1列丢弃。inplace=True参数可以在原对象上修改

3. 数据可视化

通过Numpy和Pandas,我们可以处理各种表格型数据。然后仅仅通过表格很难发现数据规律,而人类大脑进化出了很强的图像处理能力。数据分析的过程中,我们必须经历探索性数据可视化的过程(Exploratory Data Visualization)。

折线图是数据可视化中最常见的图形之一,它反应的是有序的数据之间的关系。

import pandas as pd
unrate = pd.read_csv('unrate.csv')
#pandas读入数据时会把DATE column设为text column,其数据类型为object,需转为datetime类型
unrate['DATE'] = pd.to_datetime(unrate['DATE'])
print(unrate.head(12))

import matplotlib.pyplot as plt
# 单图单线
plt.plot(unrate['DATE'], unrate['VALUE'])
plt.xticks(rotation=90)
plt.xlabel('Month')
plt.ylabel('Unemployment Rate')
plt.title('Monthly Unemployment Trends, 1948')
plt.show()
# 多图单线
fig = plt.figure(figsize=(12,5))
ax1 = fig.add_subplot(2,1,1)
ax2 = fig.add_subplot(2,1,2)
ax1.plot(unrate[0:12]['DATE'], unrate[0:12]['VALUE'])
ax1.set_title('Monthly Unemployment Rate, 1948')
ax2.plot(unrate[12:24]['DATE'], unrate[12:24]['VALUE'])
ax2.set_title('Monthly Unemployment Rate, 1949')
plt.show()

fig = plt.figure(figsize=(12,12))
for i in range(5):
    ax = fig.add_subplot(5,1,i+1)
    start_index = i*12
    end_index = (i+1)*12
    subset = unrate[start_index:end_index]
    ax.plot(subset['DATE'], subset['VALUE'])
plt.show()
#单图多线
fig = plt.figure(figsize=(10,6))
colors = ['red', 'blue', 'green', 'orange', 'black']
for i in range(5):
    start_index = i*12
    end_index = (i+1)*12
    subset = unrate[start_index:end_index]
    plt.plot(subset['MONTH'], subset['VALUE'], c=colors[i])
plt.legend(loc='upper left')    
plt.show()

饼状图和散点图是无序数据可视化的常用图形。相比折线图,饼状图和散点图改变数据序号不影响数据的表达。

import matplotlib.pyplot as plt
from numpy import arange

num_cols = ['RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue', 'Fandango_Stars']
bar_heights = norm_reviews[num_cols].iloc[0].values
bar_positions = arange(5) + 0.75
tick_positions = range(1,6)
fig, ax = plt.subplots()
ax.bar(bar_positions, bar_heights, 0.5)
ax.set_xticks(tick_positions)
ax.set_xticklabels(num_cols, rotation=90)
ax.set_xlabel('Rating Source')
ax.set_ylabel('Average Rating')
ax.set_title('Average User Rating For Avengers: Age of Ultron (2015)')
plt.show()

fig, ax = plt.subplots()
ax.scatter(norm_reviews['Fandango_Ratingvalue'], norm_reviews['RT_user_norm'])
ax.set_xlabel('Fandango')
ax.set_ylabel('Rotten Tomatoes')
plt.show()

柱状图和箱形图

import pandas as pd
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(5,20))
ax1 = fig.add_subplot(4,1,1)
ax2 = fig.add_subplot(4,1,2)
ax3 = fig.add_subplot(4,1,3)
ax4 = fig.add_subplot(4,1,4)
ax1.hist(norm_reviews['Fandango_Ratingvalue'], bins=20, range=(0, 5))
ax1.set_title('Distribution of Fandango Ratings')
ax1.set_ylim(0, 50)
ax2.hist(norm_reviews['RT_user_norm'], 20, range=(0, 5))
ax2.set_title('Distribution of Rotten Tomatoes Ratings')
ax2.set_ylim(0, 50)
ax3.hist(norm_reviews['Metacritic_user_nom'], 20, range=(0, 5))
ax3.set_title('Distribution of Metacritic Ratings')
ax3.set_ylim(0, 50)
ax4.hist(norm_reviews['IMDB_norm'], 20, range=(0, 5))
ax4.set_title('Distribution of IMDB Ratings')
ax4.set_ylim(0, 50)
plt.show()

num_cols = ['RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue']
fig, ax = plt.subplots()
ax.boxplot(norm_reviews[num_cols].values)
ax.set_xticklabels(num_cols, rotation=90)
ax.set_ylim(0,5)
plt.show()

经过Exploratory Data Visualization确定好需要的图形之后,我们需要让图形更具表现力。 对于折线图,我们只要设置好标签、图例、线型、颜色等就可以了。对于更加丰富的图形,一定要记住一点就是less is more。图形中跟主题无关的称为chartjunk,要最大化data-ink ratio,即表达数据的绘图范围与整个图形面积的比。

ax.tick_params(bottom="off", top="off", left="off", right="off")  #可用来设置刻度线显示
ax.spines["right"].set_visible(False)  #设置外框线的显示。right, bottom, left, top
## plot(a, b, c=color, label='Women', linewidth=3)
## ax.text(x, y, s)
for sp in range(0,6):
    ax = fig.add_subplot(1,6,sp+1)
    ax.plot(women_degrees['Year'], women_degrees[stem_cats[sp]], c=cb_dark_blue, label='Women', linewidth=3)
    ax.plot(women_degrees['Year'], 100-women_degrees[stem_cats[sp]], c=cb_orange, label='Men', linewidth=3)
    for key,spine in ax.spines.items():
        spine.set_visible(False)
    ax.set_xlim(1968, 2011)
    ax.set_ylim(0,100)
    ax.set_title(stem_cats[sp])
    ax.tick_params(bottom="off", top="off", left="off", right="off")
    
    if sp == 0:
        ax.text(2005, 87, 'Men')
        ax.text(2002, 8, 'Women')
    elif sp == 5:
        ax.text(2005, 62, 'Men')
        ax.text(2001, 35, 'Women')
plt.show()

4.数据清理(Data Cleaning)

在实际的数据分析项目中,很难直接获得理想的数据集,我们总是要么从分散的数据集提取数据重新汇总,要么就得做大量的数据清理工作。

import pandas as pd
data_files = ["ap_2010.csv", "class_size.csv", "demographics.csv", "graduation.csv", "hs_directory.csv", "sat_results.csv"]
data = {}

for f in data_files:
    d = pd.read_csv("schools/{0}".format(f))
    key_name = f.replace(".csv", "")
    data[key_name] = d

## Exploring the Data ##
for k in data:
    print(data[k].head())

## Reading in the Survey Data ##
all_survey = pd.read_csv("schools/survey_all.txt", delimiter="\t", encoding='windows-1252')
d75_survey = pd.read_csv("schools/survey_d75.txt", delimiter="\t", encoding='windows-1252')
survey = pd.concat([all_survey, d75_survey], axis=0)
print(survey.head())

## Cleaning Up the Surveys ##
survey["DBN"] = survey["dbn"]
survey_fields = ["DBN", "rr_s", "rr_t", "rr_p", "N_s", "N_t", "N_p", "saf_p_11", "com_p_11", "eng_p_11", 
    "aca_p_11", "saf_t_11", "com_t_11", "eng_t_11", "aca_t_11", "saf_s_11", "com_s_11", "eng_s_11", 
    "aca_s_11", "saf_tot_11", "com_tot_11",  "eng_tot_11", "aca_tot_11",]
survey = survey.loc[:,survey_fields]
data["survey"] = survey
print(survey.head())

## Inserting DBN Fields ##
data["hs_directory"]["DBN"] = data["hs_directory"]["dbn"]

def pad_csd(num):
    string_representation = str(num)
    if len(string_representation) > 1:
        return string_representation
    else:
        return string_representation.zfill(2)
    
data["class_size"]["padded_csd"] = data["class_size"]["CSD"].apply(pad_csd)
data["class_size"]["DBN"] = data["class_size"]["padded_csd"] + data["class_size"]["SCHOOL CODE"]
print(data["class_size"].head())

## Combining the SAT Scores ##
cols = ['SAT Math Avg. Score', 'SAT Critical Reading Avg. Score', 'SAT Writing Avg. Score']
for c in cols:
    data["sat_results"][c] = pd.to_numeric(data["sat_results"][c], errors="coerce")

data['sat_results']['sat_score'] = data['sat_results'][cols[0]] + data['sat_results'][cols[1]] + data['sat_results'][cols[2]]
print(data['sat_results']['sat_score'].head())

## Parsing Geographic Coordinates for Schools ##
import re
def find_lat(loc):
    coords = re.findall("\(.+\)", loc)
    lat = coords[0].split(",")[0].replace("(", "")
    return lat

data["hs_directory"]["lat"] = data["hs_directory"]["Location 1"].apply(find_lat)
print(data["hs_directory"].head())

## Extracting the Longitude ##

import re
def find_lon(loc):
    coords = re.findall("\(.+\)", loc)
    lon = coords[0].split(",")[1].replace(")", "").strip()
    return lon

data["hs_directory"]["lon"] = data["hs_directory"]["Location 1"].apply(find_lon)
data["hs_directory"]["lat"] = pd.to_numeric(data["hs_directory"]["lat"], errors="coerce")
data["hs_directory"]["lon"] = pd.to_numeric(data["hs_directory"]["lon"], errors="coerce")
print(data["hs_directory"].head())

5. 总结

本篇记录了我学习数据分析基础内容的一些东西,主要是作为自己学习记录的一个总结,必要时可用来复习。数据处理远远不止这么些内容,在实际项目中,你可能花在数据处理上的时间比其他任何时间都多。不管这样,这是一个起步。你哪也到达不了,如果你不出发的话。 路长且阻,誓不回头。


The theme of these days: Keeping catching up!

Written on June 18, 2018