Talk is cheap; let me show you the code.
import json
from pandas import DataFrame,Series
import pandas as pd
#import numpy as np
import pylab
###########
# Load the bit.ly usage log; every line of the file is parsed as one JSON value.
path = 'C:/Users/zydsb/Desktop/pydata-book-master/ch02/usagov_bitly_data2012-03-16-1331923249.txt'
# A context manager closes the file handle as soon as parsing is finished
# (the bare open() call previously left it open).
with open(path) as log_file:
    records = [json.loads(line) for line in log_file]

# Keep the 'tz' value of every record that has one; records without a 'tz'
# key are skipped.
time_zones = [rec['tz'] for rec in records if 'tz' in rec]
#use Python to count
def get_counts(sequence):
    """Count how many times each element occurs in *sequence*.

    Args:
        sequence: An iterable of hashable items.

    Returns:
        A plain ``dict`` mapping each distinct item to the number of times
        it appears. An empty iterable yields ``{}``.
    """
    # Local import keeps this function self-contained.
    from collections import Counter

    # Convert back to a plain dict so that looking up a key that never
    # occurred still raises KeyError, exactly as a hand-built dict would
    # (a Counter would silently return 0 instead).
    return dict(Counter(sequence))
#num = get_counts(time_zones) # do not count the num ,just give count{} the elements
#print num #
# Tally the time zones, then look up the count for a single zone.
num = get_counts(time_zones)
# A parenthesised single-argument print prints the same thing on Python 2
# and Python 3; the bare print statement is a SyntaxError on Python 3.
print(num['America/New_York'])
def top_counts(count_dict, n=10):
    """Return the *n* entries of *count_dict* with the highest counts.

    Args:
        count_dict: Mapping of key -> count.
        n: Maximum number of entries to return. Defaults to 10.

    Returns:
        A list of ``(count, key)`` tuples sorted in ascending order, so the
        entry with the largest count comes last. The list is empty when
        *n* is zero or negative, or when *count_dict* is empty.
    """
    # The slice [-0:] is the same as [0:], i.e. the whole list, so a
    # non-positive n has to be handled before slicing.
    if n <= 0:
        return []
    value_key_pairs = sorted((count, key) for key, count in count_dict.items())
    return value_key_pairs[-n:]
# Parenthesised single-argument print works on both Python 2 and Python 3.
print(top_counts(num))

# The same tally with pandas: value_counts() on the 'tz' column, then the
# first ten entries of the result.
frame = DataFrame(records)
tz_counts = frame['tz'].value_counts()
tz_num = tz_counts[:10]
print(tz_num)

# Plot: label the gaps first so they show up in the counts.
# NaN entries become "Missing" and empty strings become "Unknown".
clean_tz = frame['tz'].fillna("Missing")
clean_tz[clean_tz == ""] = "Unknown"
tz_counts = clean_tz.value_counts()
tz_counts01 = tz_counts[:10]
print(tz_counts01)
# Reuse the slice computed above instead of slicing tz_counts a second time.
# NOTE(review): outside an interactive pylab/IPython session the figure may
# need an explicit pylab.show() to appear; confirm how this script is run.
tz_counts01.plot(kind="barh", rot=0)
有些代碼直接在 .py 文件中運行是不支持的,但在命令行(IPython)環境中輸入是可行的,
如下:
%run "c:\users\zydsb\appdata\local\temp\tmpvrjjhh.py"
c:\users\zydsb\appdata\local\temp\tmpvrjjhh.py:8: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators; you can avoid this warning by specifying engine='python'.
  user = pd.read_table('C:/Users/zydsb/Desktop/pydata-book-master/ch02/movielens/users.dat',sep="::",header = None,names = unames)
- 在IPython中,ctrl+p可以前向搜索之前鍵入的命令
- 符號 `_` 會保存之前的函數結果,以預防出現函數結果沒有賦值給變量的情況
- `%magic` 魔法命令幫助文檔,以後應該會用
- `tab` 自動完成,防止忘記變量,函數或方法之類
# -*- coding: utf-8 -*-
#MovieLens
import pandas as pd
import json
from pandas import DataFrame,Series
# Locations of the three MovieLens data files used in this section.
path_user = 'C:/Users/zydsb/Desktop/pydata-book-master/ch02/movielens/users.dat'
path_movie = "C:/Users/zydsb/Desktop/pydata-book-master/ch02/movielens/movies.dat"
# The MovieLens archive names the ratings file "ratings.dat" (plural);
# "rating.dat" would not be found.
# NOTE(review): confirm against the local copy of the dataset.
path_rating = "C:/Users/zydsb/Desktop/pydata-book-master/ch02/movielens/ratings.dat"
#unames = ["user_id","gender","age","occa","zip"]
#user = pd.read_table('C:/Users/zydsb/Desktop/pydata-book-master/ch02/movielens/users.dat',sep="::",header = None,names = unames)
#rname = ["userz_id","movie_id","rating","times"]
#rating = pd.read_table("path_rating",sep = "::",header = None, names = rname )
#####Baby names
path = 'C:/Users/zydsb/Desktop/pydata-book-master/ch02/names/yob2000.txt'
# Raw lines of the file; the context manager closes the handle once they
# have been read.
with open(path) as names_file:
    records = list(names_file)
# One-column DataFrame holding the raw lines (kept for later inspection).
frame = DataFrame(records)
# read_csv expects a file path or file-like object, not a DataFrame, so the
# file is parsed straight from `path`. The file has no header row, hence
# the explicit column names.
names_2000 = pd.read_csv(path, names=['name', 'sex', 'birth'])
# Sum of the 'birth' column for each distinct value of 'sex'.
names_2000_group = names_2000.groupby('sex').birth.sum()