usa.gov数据实例¶
# Initial data load: parse the usa.gov/bit.ly sample file (one JSON record
# per line) into a list of dicts, then into a DataFrame.
import json

import numpy as np
import pandas as pd
from pandas import DataFrame, Series

# Full path to the raw data file.
path = r'C:\Users\zhengt\Desktop\Study\Python\Jupyter\PythonForDataAnalysis\pydata-book-master\ch02\usagov_bitly_data2012-03-16-1331923249.txt'

# Use a context manager so the file handle is closed deterministically
# (the original called open() and never closed the file).
with open(path) as f:
    records = [json.loads(line) for line in f]  # list[dict]

frame = DataFrame(records)  # list-of-dicts -> DataFrame
# Count time zones with plain Python, then draw a horizontal bar chart.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

# Extract the 'tz' (time zone) field; some records lack it.
time_zones = [rec['tz'] for rec in records if 'tz' in rec]
# print(time_zones[:10])  # inspect the structure if needed


def get_counts(sequence):
    """Return a dict mapping each item of *sequence* to its frequency."""
    counts = {}
    for x in sequence:
        # Single-lookup update replaces the original two-branch if/else.
        counts[x] = counts.get(x, 0) + 1
    return counts


counts = get_counts(time_zones)


def top_counts(count_dict, n=10):
    """Return the n most frequent (count, key) pairs, ascending by count."""
    value_key_pairs = [(count, tz) for tz, count in count_dict.items()]
    value_key_pairs.sort()          # tuples sort by count first
    return value_key_pairs[-n:]     # top n


tz_items = top_counts(counts)


def draw_bar_plot(items):
    """Draw a horizontal bar chart from (count, label) pairs."""
    labels = [item[1] for item in items]   # time-zone names
    values = [item[0] for item in items]   # counts
    positions = np.arange(len(labels))     # numeric positions for barh
    plt.barh(positions, values, align='center')
    ax = plt.gca()
    ax.set_yticks(range(len(labels)))      # one tick per bar (y axis)
    ax.set_yticklabels(labels, rotation=0)
    plt.show()


draw_bar_plot(tz_items)
%matplotlib inline#用pandas对时区进行计数+画条形图import matplotlib.pyplot as pltfrom pandas import DataFrame,Seriesimport pandas as pd;import numpy as npclean_tz=frame['tz'].fillna('Missing') #填补缺失值NaNclean_tz[clean_tz=='']='Unknown' #填补空值tz_counts=clean_tz.value_counts() #根据tz字段计数tz_counts[:10].plot(kind='barh',rot=0)
#按agent进行分类用户+条形图#对于一个 Series,dropna 返回一个仅含非空数据和索引值的 Series%matplotlib inlineimport matplotlib.pyplot as pltresults=Series(x.split()[0] for x in frame.a.dropna()) #dropna->删除na行results.value_counts()[:5].plot(kind='barh',rot=0) #根据计数画图,top5
#按WINDOWS和非WINDOWS用户%matplotlib inlineimport pandas as pd;import numpy as npfrom pandas import DataFrame,Seriesimport matplotlib.pyplot as pltcframe=frame[frame.a.notnull()] #获取agent不缺失的值#numpy.where(T/F,Condition->T,->F)operating_system=np.where(cframe['a'].str.contains('Windows'),'Windows','Not Windows')#根据时区对operating_system进行分组by_tz_os=cframe.groupby(['tz',operating_system])#用size对分组结果进行计数,并用unstack进行重塑agg_counts=by_tz_os.size().unstack().fillna(0)indexer=agg_counts.sum(1).argsort()#用take截取最后10行count_subset=agg_counts.take(indexer)[-10:]#作图,stacked堆积条形图count_subset.plot(kind='barh',stacked=True)#显示成百分比normed_subset=count_subset.div(count_subset.sum(1),axis=0)normed_subset.plot(kind='barh',stacked=True)
movielens¶
# Initial setup: pandas import and the MovieLens data directory.
# read_table can take engine='python' to select the Python parser.
import pandas as pd

# BUG FIX: the original literal was not a raw string, so 'C:\Users' contains
# the escape '\U' — a SyntaxError on Python 3. Use a raw string (this also
# removes the accidental double backslash before 'Python').
adr = r'C:\Users\zhengt\Desktop\Study\Python\Jupyter\PythonForDataAnalysis\pydata-book-master\ch02\movielens'
# Load the users table; fields are '::'-separated with no header row.
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table(adr + '\\users.dat', sep='::', header=None,
                      names=unames, engine='python')
users[:5]  # peek at the first rows
user_id | gender | age | occupation | zip | |
---|---|---|---|---|---|
0 | 1 | F | 1 | 10 | 48067 |
1 | 2 | M | 56 | 16 | 70072 |
2 | 3 | M | 25 | 15 | 55117 |
3 | 4 | M | 45 | 7 | 02460 |
4 | 5 | M | 25 | 20 | 55455 |
# Load the ratings table (user_id::movie_id::rating::timestamp).
# BUG FIX: this cell was a stale copy of the users loader, but its output
# below and the later pd.merge(..., ratings, ...) require it to define
# `ratings` from ratings.dat.
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table(adr + '\\ratings.dat', sep='::', header=None,
                        names=rnames, engine='python')
ratings[:5]  # peek at the first rows
user_id | movie_id | rating | timestamp | |
---|---|---|---|---|
0 | 1 | 1193 | 5 | 978300760 |
1 | 1 | 661 | 3 | 978302109 |
2 | 1 | 914 | 3 | 978301968 |
3 | 1 | 3408 | 4 | 978300275 |
4 | 1 | 2355 | 5 | 978824291 |
# Load the movies table (movie_id::title::genres).
# BUG FIX: this cell was a stale copy of the users loader, but its output
# below and the later pd.merge(..., movies) require it to define `movies`
# from movies.dat.
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table(adr + '\\movies.dat', sep='::', header=None,
                       names=mnames, engine='python')
movies[:5]  # peek at the first rows
movie_id | title | genres | |
---|---|---|---|
0 | 1 | Toy Story (1995) | Animation|Children's|Comedy |
1 | 2 | Jumanji (1995) | Adventure|Children's|Fantasy |
2 | 3 | Grumpier Old Men (1995) | Comedy|Romance |
3 | 4 | Waiting to Exhale (1995) | Comedy|Drama |
4 | 5 | Father of the Bride Part II (1995) | Comedy |
# Merge the three tables; pandas joins automatically on shared column names.
data = pd.merge(pd.merge(ratings, users), movies)

# Mean rating per title, split by gender.
mean_ratings = data.pivot_table('rating', index='title', columns='gender',
                                aggfunc='mean')

# Keep only titles with at least 250 ratings.
ratings_by_title = data.groupby('title').size()  # count per title
active_titles = ratings_by_title.index[ratings_by_title >= 250]
# .loc replaces the deprecated (and since removed) .ix indexer.
mean_ratings = mean_ratings.loc[active_titles]

# Titles women rated highest.
top_female_ratings = mean_ratings.sort_values(by='F', ascending=False)
# Rating disagreement between men and women.
mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']
sorted_by_diff = mean_ratings.sort_values(by='diff')  # ascending by diff
sorted_by_diff[:15]        # 15 titles women preferred most
sorted_by_diff[::-1][:15]  # 15 titles men preferred most

# Most divisive titles regardless of gender: highest rating std deviation.
rating_std_by_title = data.groupby('title')['rating'].std()
# Restrict to active titles; .loc replaces the removed .ix indexer.
rating_std_by_title = rating_std_by_title.loc[active_titles]
rating_std_by_title.sort_values(ascending=False)[:10]
title
Dumb & Dumber (1994)                     1.321333
Blair Witch Project, The (1999)          1.316368
Natural Born Killers (1994)              1.307198
Tank Girl (1995)                         1.277695
Rocky Horror Picture Show, The (1975)    1.260177
Eyes Wide Shut (1999)                    1.259624
Evita (1996)                             1.253631
Billy Madison (1995)                     1.249970
Fear and Loathing in Las Vegas (1998)    1.246408
Bicentennial Man (1999)                  1.245533
Name: rating, dtype: float64
全美婴儿姓名¶
# Load the US baby-name files (yob1880.txt .. yob2010.txt) into one DataFrame.
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Data directory (raw string; original had a stray double backslash).
adr = r'C:\Users\zhengt\Desktop\Study\Python\Jupyter\PythonForDataAnalysis\pydata-book-master\ch02\names'

years = range(1880, 2011)
pieces = []
columns = ['name', 'sex', 'births']  # column headers for the CSV files
for year in years:
    # os.path.join avoids the invalid '\y' escape in the original '\yob%d.txt'.
    path = os.path.join(adr, 'yob%d.txt' % year)
    piece = pd.read_csv(path, names=columns, engine='python')
    piece['year'] = year  # tag each file's rows with its year
    pieces.append(piece)
    # (Using a local name `piece` also avoids clobbering the earlier
    # usa.gov `frame` global, as the original did.)

# Combine everything; ignore_index drops the per-file row numbers.
names = pd.concat(pieces, ignore_index=True)
%matplotlib inline#根据year和sex进行births(出生人数)聚合total_births=names.pivot_table('births',index='year',columns='sex',aggfunc=sum)#绘图total_births.plot(title='Total births by sex and year')
# Compute prop: each name's share of births within its (year, sex) group
# (e.g. 0.02 means 2 of every 100 babies that year/sex got this name).
def add_prop(group):
    """Add a 'prop' column: births / total births within the group."""
    # Cast to float: integer division would truncate (needed on Python 2).
    births = group.births.astype(float)
    group['prop'] = births / births.sum()
    return group


names = names.groupby(['year', 'sex']).apply(add_prop)

# Sanity-check the first row: its prop equals births over its group total.
# (.iloc/.loc replace the removed .ix indexer used by the original.)
names_group_sum = names.groupby(['year', 'sex']).births.sum()
first = names.iloc[0]
first.births / names_group_sum.loc[(first.year, first.sex)]
first.prop

# All group prop sums must equal 1 (within float tolerance).
np.allclose(names.groupby(['year', 'sex']).prop.sum(), 1)
names.head()  # inspect the result
name | sex | births | year | prop | |
---|---|---|---|---|---|
0 | Mary | F | 7065 | 1880 | 0.077643 |
1 | Anna | F | 2604 | 1880 | 0.028618 |
2 | Emma | F | 2003 | 1880 | 0.022013 |
3 | Elizabeth | F | 1939 | 1880 | 0.021309 |
4 | Minnie | F | 1746 | 1880 | 0.019188 |
# Take the 1000 most popular names per (year, sex) group.
def get_top1000(group):
    """Return the 1000 rows with the most births within one group."""
    return group.sort_values(by='births', ascending=False)[:1000]


top1000 = names.groupby(['year', 'sex']).apply(get_top1000)

# Split by sex for later analyses.
boys = top1000[top1000.sex == 'M']
girls = top1000[top1000.sex == 'F']

# Births per year for every name in the top-1000 sample.
total_births = top1000.pivot_table('births', index='year', columns='name',
                                   aggfunc='sum')
%matplotlib inline#常用名字趋势subset=total_births[['John','Harry','Mary','Marilyn']]subset.plot(subplots=True,figsize=(12,10),grid=False,title="Number of births per year")
%matplotlib inlineimport numpy as np#评佑命名多样性#1.计算1000样本占总样本比例table=top1000.pivot_table('prop',index='year',columns='sex',aggfunc=sum)#np.linspace(0,1.2,13)->将数据0-1.2平分成13段table.plot(title='Sum of table1000.prop by year and sex',yticks=np.linspace(0,1.2,13),xticks=range(1880,2020,10))#2.计算占总出生人数前50%的不同名字数量def get_quantile_count(group,q=0.5): group=group.sort_values(by='prop',ascending=False) #按prop降序 return group.prop.cumsum().searchsorted(q)+1 #查找50%所在位置diversity=top1000.groupby(['year','sex']).apply(get_quantile_count)diversity=diversity.unstack('sex') #将Series分解成dataframe#unstakc会将数据type转成object,需转成数值diversity_num=diversity[['F','M']].astype(float)print diversity.dtypes,diversity_num.dtypesdiversity_num.plot(title="Number of popular names in top 50%")
sex
F    object
M    object
dtype: object
sex
F    float64
M    float64
dtype: object
%matplotlib inlineimport numpy as np#分析最后一个字母get_last_letter=lambda x:x[-1] #获取最后一个字母公式last_letters=names.name.map(get_last_letter) get_last_letter=lambda x:x[-1]last_letters=names.name.map(get_last_letter)last_letters.name='last_letter'table_letter=names.pivot_table('births',index=last_letters,columns=['sex','year'],aggfunc=sum)#选取3年作为样本subtable_1=table_letter.reindex(columns=[1910,1960,2010],level='year')#选取近5年作为样本subtable_2=table_letter.reindex(columns=range(2006,2011,1),level='year')#计算百分比letter_prop_1=subtable_1/subtable_1.sum().astype(float)letter_prop_2=subtable_2/subtable_2.sum().astype(float)fix,axes=plt.subplots(2,2,figsize=(12,8)) #定义分图(row,column)letter_prop_1['M'].plot(kind='bar',rot=0,ax=axes[0,0],title='Male')letter_prop_1['F'].plot(kind='bar',rot=0,ax=axes[1,0],title='Female',legend=False)letter_prop_2['M'].plot(kind='bar',rot=0,ax=axes[0,1],title='Male')letter_prop_2['F'].plot(kind='bar',rot=0,ax=axes[1,1],title='Female',legend=False)
#以d,n,y结尾的趋势%matplotlib inlineletter_prop=table/table.sum().astype(float)dny_ts=letter_prop.ix[['d','n','y'],'M'].T #选择列并转置dny_ts.plot()
#中性名字的变化趋势%matplotlib inlineall_names=top1000.name.unique() #名字唯一值#以lesl开头,Lesley,Leslie..mask=np.array(['lesl' in x.lower() for x in all_names]) #返回T/Flesley_like=all_names[mask]filtered=top1000[top1000.name.isin(lesley_like)]table_lesl=filtered.pivot_table('births',index='year',columns='sex',aggfunc=sum)table_lesl=table_lesl.div(table_lesl.sum(1),axis=0)table_lesl.plot()