博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
PythonForDataAnalysis-study_2
阅读量:5994 次
发布时间:2019-06-20

本文共 10152 字,大约阅读时间需要 33 分钟。

hot3.png

usa.gov数据实例¶

#初始数据引入import jsonfrom pandas import DataFrame,Seriesimport pandas as pd;import numpy as np#获取文件地址,此处为完整地址path=r'C:\Users\zhengt\Desktop\Study\Python\Jupyter\PythonForDataAnalysis\pydata-book-master\ch02\usagov_bitly_data2012-03-16-1331923249.txt'#用JSON读取网页,返回格式list[Dic]records=[json.loads(line) for line in open(path)]frame=DataFrame(records)               #List->dataframe
#用Python对时区进行计数+并条作图#获取时区字段tzimport pandas as pd;import numpy as npfrom pandas import DataFrame,Seriesimport matplotlib.pyplot as plttime_zones=[rec['tz'] for rec in records if 'tz' in rec]# print time_zones[:10]        #查看数据结构#自定义计数函数,用字典进记数def get_counts(sequence):    counts={}    for x in sequence:        if x in counts:            counts[x]+=1        else:            counts[x]=1    return counts                #字典返回counts=get_counts(time_zones)#获取TOPndef top_counts(count_dict,n=10):    value_key_pairs=[]    #函数式语句等于下面for循环    value_key_pairs=[(count,tz) for tz,count in count_dict.items()]#     for (tz,count) in count_dict.items():#         value_key_pairs.append((count,tz))      value_key_pairs.sort()                   #排序    return value_key_pairs[-n:]              #List返回TOP ntz_items=top_counts(counts)                 #获取Top n的数列#作柱形图def draw_bar_plot(items):    x = []    y = []    #将dict分成两个List    for item in items:        y.append(item[0])            #count        x.append(item[1])            #tz    #生成序列,plot只接受数值类型    x1=np.arange(len(x))             #生成数列     plt.barh(x1,y,align='center')    #生成条形图    ax=plt.gca()                     #获取图片格式变量    ax.set_yticks(range(len(x)))     #x座标轴范围    ax.set_yticklabels(x,rotation=0) #x座标轴名称,旋转0    plt.show()#调用图表支出draw_bar_plot(tz_items)

145702_RSgp_2718942.png

%matplotlib inline#用pandas对时区进行计数+画条形图import matplotlib.pyplot as pltfrom pandas import DataFrame,Seriesimport pandas as pd;import numpy as npclean_tz=frame['tz'].fillna('Missing') #填补缺失值NaNclean_tz[clean_tz=='']='Unknown'       #填补空值tz_counts=clean_tz.value_counts()      #根据tz字段计数tz_counts[:10].plot(kind='barh',rot=0)

145702_D0Df_2718942.png

#按agent进行分类用户+条形图#对于一个 Series,dropna 返回一个仅含非空数据和索引值的 Series%matplotlib inlineimport matplotlib.pyplot as pltresults=Series(x.split()[0] for x in frame.a.dropna())   #dropna->删除na行results.value_counts()[:5].plot(kind='barh',rot=0)      #根据计数画图,top5

145703_KKoM_2718942.png

#按WINDOWS和非WINDOWS用户%matplotlib inlineimport pandas as pd;import numpy as npfrom pandas import DataFrame,Seriesimport matplotlib.pyplot as pltcframe=frame[frame.a.notnull()]     #获取agent不缺失的值#numpy.where(T/F,Condition->T,->F)operating_system=np.where(cframe['a'].str.contains('Windows'),'Windows','Not Windows')#根据时区对operating_system进行分组by_tz_os=cframe.groupby(['tz',operating_system])#用size对分组结果进行计数,并用unstack进行重塑agg_counts=by_tz_os.size().unstack().fillna(0)indexer=agg_counts.sum(1).argsort()#用take截取最后10行count_subset=agg_counts.take(indexer)[-10:]#作图,stacked堆积条形图count_subset.plot(kind='barh',stacked=True)#显示成百分比normed_subset=count_subset.div(count_subset.sum(1),axis=0)normed_subset.plot(kind='barh',stacked=True)

145703_VymV_2718942.png

145704_1jjR_2718942.png

movielens¶

#初始数据引入import pandas as pd#read_table可以用engine='python'指定读入程序adr='C:\Users\zhengt\Desktop\Study\\Python\Jupyter\PythonForDataAnalysis\pydata-book-master\ch02\movielens'

 

#读入users信息unames=['user_id','gender','age','occupation','zip']users=pd.read_table(adr+'\\users.dat',sep='::',header=None,names=unames,engine = 'python')users[:5]       #查看结构
  user_id gender age occupation zip
0 1 F 1 10 48067
1 2 M 56 16 70072
2 3 M 25 15 55117
3 4 M 45 7 02460
4 5 M 25 20 55455

 

#读入users信息unames=['user_id','gender','age','occupation','zip']users=pd.read_table(adr+'\\users.dat',sep='::',header=None,names=unames,engine = 'python')users[:5]       #查看结构
  user_id movie_id rating timestamp
0 1 1193 5 978300760
1 1 661 3 978302109
2 1 914 3 978301968
3 1 3408 4 978300275
4 1 2355 5 978824291

 

#读入users信息unames=['user_id','gender','age','occupation','zip']users=pd.read_table(adr+'\\users.dat',sep='::',header=None,names=unames,engine = 'python')users[:5]       #查看结构
  movie_id title genres
0 1 Toy Story (1995) Animation|Children's|Comedy
1 2 Jumanji (1995) Adventure|Children's|Fantasy
2 3 Grumpier Old Men (1995) Comedy|Romance
3 4 Waiting to Exhale (1995) Comedy|Drama
4 5 Father of the Bride Part II (1995) Comedy

 

#将三个表的数据合并#因为数据的列名是相同的,所以pandas会自动合并data=pd.merge(pd.merge(ratings,users),movies)mean_ratings=data.pivot_table('rating',index='title',columns='gender',aggfunc='mean')#过滤评分数据>=250ratings_by_title=data.groupby('title').size()         #根据title记数active_titles=ratings_by_title.index[ratings_by_title>=250]#获取所需行mean_ratings=mean_ratings.ix[active_titles]#女性最喜欢电影top_female_ratings=mean_ratings.sort_values(by='F',ascending=False)

 

#计算评分差异#男性和女性评分差异最大mean_ratings['diff']=mean_ratings['M']-mean_ratings['F']sorted_by_diff=mean_ratings.sort_values(by='diff')    #按差异diff列进行排序#差异top15女性最喜欢sorted_by_diff[:15]#差异top15男性最喜欢sorted_by_diff[::-1][:15]#只找出差异最大电影(并不考虑性别),计算标准差rating_std_by_title=data.groupby('title')['rating'].std()#根据active_title进行过滤rating_std_by_title=rating_std_by_title.ix[active_titles]#根据值对Series进行降序排列rating_std_by_title.sort_values(ascending=False)[:10]

 

titleDumb & Dumber (1994)                     1.321333Blair Witch Project, The (1999)          1.316368Natural Born Killers (1994)              1.307198Tank Girl (1995)                         1.277695Rocky Horror Picture Show, The (1975)    1.260177Eyes Wide Shut (1999)                    1.259624Evita (1996)                             1.253631Billy Madison (1995)                     1.249970Fear and Loathing in Las Vegas (1998)    1.246408Bicentennial Man (1999)                  1.245533Name: rating, dtype: float64

 

全美婴儿姓名¶

#初始数据引入import pandas as pdimport numpy as npimport matplotlib.pyplot as plt#目录adr=r'C:\Users\zhengt\Desktop\Study\\Python\Jupyter\PythonForDataAnalysis\pydata-book-master\ch02\names'#将分散txt数据汇总成一个dataframeyears=range(1880,2011)pieces=[]columns=['name','sex','births']     #列标题for year in years:    path=adr+'\yob%d.txt' %year     #获取文件名    frame=pd.read_csv(path,names=columns,engine = 'python')         #读入文件    frame['year']=year    pieces.append(frame)#将所有数据整合到一个dataframe中,List->dataframenames=pd.concat(pieces,ignore_index=True)       #ignore_index表示不保留原csv行号

 

%matplotlib inline#根据year和sex进行births(出生人数)聚合total_births=names.pivot_table('births',index='year',columns='sex',aggfunc=sum)#绘图total_births.plot(title='Total births by sex and year')

145703_0zoi_2718942.png

 

#计算比例值,0.02表示100个婴儿中2个取了当前这个名字#自定义函数,计算比例值def add_prop(group):    #整数除法会取整,指定为float    births=group.births.astype(float)    group['prop']=births/births.sum()    return group#引用自定义函数,添加prop列names=names.groupby(['year','sex']).apply(add_prop)#查看1stnames_group_sum=names.groupby(['year','sex']).births.sum()1.0*names.ix[0,2]/names_group_sum.ix[0,2]names.ix[0,4]        #prop列#用np.allclose来检查分组总计值np.allclose(names.groupby(['year','sex']).prop.sum(),1)names.head()         #查看数据
  name sex births year prop
0 Mary F 7065 1880 0.077643
1 Anna F 2604 1880 0.028618
2 Emma F 2003 1880 0.022013
3 Elizabeth F 1939 1880 0.021309
4 Minnie F 1746 1880 0.019188
#根据sex/year获取top1000样本#方法1:运用grouped.applydef get_top1000(group):    return group.sort_values(by='births',ascending=False)[:1000]grouped=names.groupby(['year','sex'])top1000=grouped.apply(get_top1000)#方法2:运用循环group->添加list->转成dataframe# pieces=[]# for year,group in names.groupby(['year','sex']):#     pieces.append(group.sort_values(by='births',ascending=False)[:1000])# top1000=pd.concat(pieces,ignore_index=True)boys=top1000[top1000.sex=='M']girls=top1000[top1000.sex=='F']total_births=top1000.pivot_table('births',index='year',columns='name',aggfunc=sum)

 

%matplotlib inline#常用名字趋势subset=total_births[['John','Harry','Mary','Marilyn']]subset.plot(subplots=True,figsize=(12,10),grid=False,title="Number of births per year")

145722_fRwG_2718942.png

%matplotlib inlineimport numpy as np#评佑命名多样性#1.计算1000样本占总样本比例table=top1000.pivot_table('prop',index='year',columns='sex',aggfunc=sum)#np.linspace(0,1.2,13)->将数据0-1.2平分成13段table.plot(title='Sum of table1000.prop by year and sex',yticks=np.linspace(0,1.2,13),xticks=range(1880,2020,10))#2.计算占总出生人数前50%的不同名字数量def get_quantile_count(group,q=0.5):    group=group.sort_values(by='prop',ascending=False)   #按prop降序    return group.prop.cumsum().searchsorted(q)+1         #查找50%所在位置diversity=top1000.groupby(['year','sex']).apply(get_quantile_count)diversity=diversity.unstack('sex')                       #将Series分解成dataframe#unstakc会将数据type转成object,需转成数值diversity_num=diversity[['F','M']].astype(float)print diversity.dtypes,diversity_num.dtypesdiversity_num.plot(title="Number of popular names in top 50%")
sexF    objectM    objectdtype: object sexF    float64M    float64dtype: object

145706_PiDz_2718942.png

145707_pfjJ_2718942.png

%matplotlib inlineimport numpy as np#分析最后一个字母get_last_letter=lambda x:x[-1]          #获取最后一个字母公式last_letters=names.name.map(get_last_letter)  get_last_letter=lambda x:x[-1]last_letters=names.name.map(get_last_letter)last_letters.name='last_letter'table_letter=names.pivot_table('births',index=last_letters,columns=['sex','year'],aggfunc=sum)#选取3年作为样本subtable_1=table_letter.reindex(columns=[1910,1960,2010],level='year')#选取近5年作为样本subtable_2=table_letter.reindex(columns=range(2006,2011,1),level='year')#计算百分比letter_prop_1=subtable_1/subtable_1.sum().astype(float)letter_prop_2=subtable_2/subtable_2.sum().astype(float)fix,axes=plt.subplots(2,2,figsize=(12,8))     #定义分图(row,column)letter_prop_1['M'].plot(kind='bar',rot=0,ax=axes[0,0],title='Male')letter_prop_1['F'].plot(kind='bar',rot=0,ax=axes[1,0],title='Female',legend=False)letter_prop_2['M'].plot(kind='bar',rot=0,ax=axes[0,1],title='Male')letter_prop_2['F'].plot(kind='bar',rot=0,ax=axes[1,1],title='Female',legend=False)

145717_EckK_2718942.png

#以d,n,y结尾的趋势%matplotlib inlineletter_prop=table/table.sum().astype(float)dny_ts=letter_prop.ix[['d','n','y'],'M'].T      #选择列并转置dny_ts.plot()

145714_7aUf_2718942.png

#中性名字的变化趋势%matplotlib inlineall_names=top1000.name.unique()     #名字唯一值#以lesl开头,Lesley,Leslie..mask=np.array(['lesl' in x.lower() for x in all_names])  #返回T/Flesley_like=all_names[mask]filtered=top1000[top1000.name.isin(lesley_like)]table_lesl=filtered.pivot_table('births',index='year',columns='sex',aggfunc=sum)table_lesl=table_lesl.div(table_lesl.sum(1),axis=0)table_lesl.plot()

145718_mhPd_2718942.png

 

转载于:https://my.oschina.net/tedzheng/blog/828415

你可能感兴趣的文章
百晓生带你玩转linux系统服务搭建系列----DNS服务的搭建一(正向解析)
查看>>
老男孩50期linux高级运维—决心书
查看>>
记录一次误删数据库指定数据---------->血的教训
查看>>
UML学习---用例图
查看>>
网络营销有哪些有效的营销方式?
查看>>
给大家分享学好 Python 的 11 个优秀资源
查看>>
在古巴买雪茄又不知道怎么选?这里为你推荐几款古巴雪茄中的极品
查看>>
跨过Nginx上基于uWSGI部署Django项目的坑
查看>>
10.开机启动脚本,用户文件含义《Mr.Robot》
查看>>
我的友情链接
查看>>
Xen虚拟机克隆并解决网卡失效的问题
查看>>
python字符串格式化
查看>>
管理员必备的20个Linux系统监控工具
查看>>
debian squeeze 6.0 安装 virtualbox AMD64架构
查看>>
asp生成Html源码
查看>>
Flex BlazeDS Java 使用
查看>>
win7反复自动关机
查看>>
xshell自动保存用户名后的更改方法
查看>>
JBoss Web和 Tomcat的区别
查看>>
leetCode 235. Lowest Common Ancestor of a Binary Search Tree 二排序树问题
查看>>