### pandas
import pandas as pd
import numpy as np
# A Series is a one-dimensional labelled array; np.nan marks a missing value.
s = pd.Series([1, 3, 6, np.nan, 44, 1])

# Six consecutive daily timestamps starting at 2017-01-01, used as row labels.
dates = pd.date_range(start="20170101", periods=6)

# index   -> row labels
# columns -> column labels
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,
    columns=list("abcd"),
)

# Basic inspection. Each expression returns a value that is only echoed in an
# interactive session; when run as a script the result is discarded.
df.dtypes
df.index
df.columns
df.values
df.describe()
df.transpose()
df.sort_index(axis=1, ascending=False)  # sort by column labels, descending
df.sort_values("a")  # sort rows by the values held in column "a"
################################################
# Selecting data from a 6x4 integer frame indexed by six consecutive days.
dates = pd.date_range("20130101", periods=6)
df = pd.DataFrame(
    np.arange(24).reshape((6, 4)),
    index=dates,
    columns=["A", "B", "C", "D"],
)

# A single column: item access and attribute access are equivalent here.
print(df["A"], df.A)
# Row slices: by position, then by date label (label slices include the end).
print(df[0:3])
print(df["20130102":"20130104"])

# select by label: loc
print(df.loc["20130102"])
print(df.loc[:, ["A", "B"]])
print(df.loc["20130102":, ["A", "B"]])

# select by position: iloc
print(df.iloc[3:5, 1:3])

# select by label and position
# mixed selection: `DataFrame.ix` was removed in pandas 1.0, so translate the
# positional row slice into labels and do the whole lookup through `.loc`.
print(df.loc[df.index[:3], ["A", "C"]])

# Boolean indexing -- filtering rows
print(df)
print(df[df.A > 8])
#=============================================================
# Assigning values and handling missing data on a fresh 6x4 integer frame.
dates = pd.date_range("20130101", periods=6)
df = pd.DataFrame(
    np.arange(24).reshape((6, 4)),
    index=dates,
    columns=["A", "B", "C", "D"],
)

df.iloc[2, 2] = 1111            # assign by position
df.loc["20130101", "B"] = 2222  # assign by label
df[df.A > 4] = 0                # zero every column of the rows where A > 4

# Conditional assignment to one column. This goes through a single `.loc`
# call instead of the chained form `df.A[mask] = 0`, which writes to an
# intermediate object and is not guaranteed to update `df` itself.
df.loc[df.A > 4, "A"] = 0
df.loc[df.A > 4, "B"] = 0

df["F"] = np.nan  # add a new column filled with NaN
# Add a new column from a Series; values are aligned on the date index.
df["E"] = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range("20130101", periods=6))

# The calls below return new objects and leave `df` unchanged; the results
# are only visible in an interactive session.
df.dropna(axis=0, how="any")  # how in {"any", "all"}: drop rows containing NaN
df.dropna(axis=1, how="any")  # how in {"any", "all"}: drop columns containing NaN
df.fillna(value=0)            # replace NaN with 0
df.isnull()                   # boolean mask of the missing entries
df.isnull().values.any()      # True if at least one entry is missing
#--------------------------------
#data = pd.read_csv("student.csv")  # read_csv can also read .txt files
# save
#data.to_pickle("student.pickle")
# combining multiple DataFrames
#concatenating
# Three 3x4 float frames filled with 0, 1 and 2, all sharing the same columns.
df1 = pd.DataFrame(np.zeros((3, 4)), columns=list("abcd"))
df2 = pd.DataFrame(np.ones((3, 4)), columns=list("abcd"))
df3 = pd.DataFrame(np.full((3, 4), 2.0), columns=list("abcd"))

# Stack the frames vertically; each frame keeps its own 0..2 row labels.
res = pd.concat([df1, df2, df3], axis="index")
print(res)

# Same vertical stack, but with the row labels renumbered 0..8.
res = pd.concat([df1, df2, df3], axis="index", ignore_index=True)
#--------------------------------------
# join: "inner" or "outer"
# Two frames whose columns (a-d vs b-e) and row labels only partly overlap.
df1 = pd.DataFrame(np.zeros((3, 4)), index=[1, 2, 3], columns=list("abcd"))
df2 = pd.DataFrame(np.ones((3, 4)), index=[2, 3, 4], columns=list("bcde"))
print(df1)
print(df2)

res = pd.concat([df1, df2])
# "outer" is the default: keep the union of the columns.
res = pd.concat([df1, df2], join="outer")
# "inner": keep only the columns present in both frames.
res = pd.concat([df1, df2], join="inner")
res = pd.concat([df1, df2], join="inner", ignore_index=True)
# Aligning a side-by-side concat on one frame's row labels
# (formerly done with the `join_axes` argument).
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=["a", "b", "c", "d"], index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=["b", "c", "d", "e"], index=[2, 3, 4])

# `join_axes` was removed from `pd.concat` in pandas 1.0; reindexing the
# result on df1's index keeps exactly df1's rows.
res = pd.concat([df1, df2], axis=1).reindex(df1.index)

s1 = pd.Series([1, 2, 3, 4], index=["a", "b", "c", "d"])
# `DataFrame.append` was removed in pandas 2.0. Turn the Series into a
# one-row frame and concatenate it to add it as a new row.
res = pd.concat([df1, s1.to_frame().T], ignore_index=True)
# Stack several frames under df1. `DataFrame.append` was removed in pandas
# 2.0; `pd.concat` with df1 listed first produces the same row order.
res = pd.concat([df1, df1, df2, df3])
#------------
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=["a", "b", "c", "d"])
s1 = pd.Series([1, 2, 3, 4], index=["a", "b", "c", "d"])
# Add one row of data. `DataFrame.append` was removed in pandas 2.0, so the
# Series is turned into a one-row frame and concatenated instead.
res = pd.concat([df1, s1.to_frame().T], ignore_index=True)
# ------ merging DataFrames
# NOTE: the `from __future__ import print_function` that used to sit here was
# dropped. A __future__ import is only legal at the very top of a module, so
# placing it mid-file is a SyntaxError; print is already a function on Python 3.
import pandas as pd

# merging two df by key/keys. (may be used in database)
# simple example
left = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K3'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})
right = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K3'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
})
print(left)
print(right)

# Rows are paired wherever the 'key' values are equal.
res = pd.merge(left, right, on='key')
print(res)
# Merging on a pair of key columns.
left = pd.DataFrame({
    'key1': ['K0', 'K0', 'K1', 'K2'],
    'key2': ['K0', 'K1', 'K0', 'K1'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})
right = pd.DataFrame({
    'key1': ['K0', 'K1', 'K1', 'K2'],
    'key2': ['K0', 'K0', 'K0', 'K0'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
})
print(left)
print(right)

# how may be 'left', 'right', 'outer' or 'inner'; 'inner' is the default.
res = left.merge(right, how='inner', on=['key1', 'key2'])
# 'left' keeps every row of `left`; rows without a partner get NaN in C and D.
res = left.merge(right, how='left', on=['key1', 'key2'])
print(res)
# The indicator column records which input each merged row came from.
df1 = pd.DataFrame({'col1': [0, 1], 'col_left': ['a', 'b']})
df2 = pd.DataFrame({'col1': [1, 2, 2], 'col_right': [2, 2, 2]})
print(df1)
print(df2)

# indicator=True adds the column under its default name.
res = df1.merge(df2, how='outer', on='col1', indicator=True)
# Passing a string instead gives the indicator column a custom name.
res = df1.merge(df2, how='outer', on='col1', indicator='indicator_column')
# Merging on the row index rather than on a key column.
left = pd.DataFrame(
    {'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']},
    index=['K0', 'K1', 'K2'],
)
right = pd.DataFrame(
    {'C': ['C0', 'C2', 'C3'], 'D': ['D0', 'D2', 'D3']},
    index=['K0', 'K2', 'K3'],
)
print(left)
print(right)

# left_index / right_index select the index of each frame as its join key.
res = left.merge(right, how='outer', left_index=True, right_index=True)
res = left.merge(right, how='inner', left_index=True, right_index=True)
# Both frames have an 'age' column; suffixes tell the two copies apart.
boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
res = boys.merge(girls, how='inner', on='k', suffixes=['_boy', '_girl'])
print(res)
# The join function in pandas is similar to merge: if you know merge, you will understand join.
#-====================================================
# NOTE: the `from __future__ import print_function` that used to sit here was
# dropped. A __future__ import is only legal at the very top of a module, so
# placing it mid-file is a SyntaxError; print is already a function on Python 3.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# plot data
# Series: the cumulative sum of 1000 normal samples, i.e. a random walk.
data = pd.Series(np.random.randn(1000), index=np.arange(1000))
data = data.cumsum()
##data.plot()

# DataFrame: four independent random walks in columns A-D.
data = pd.DataFrame(np.random.randn(1000, 4), index=np.arange(1000), columns=list("ABCD"))
data = data.cumsum()

# plot methods:
# 'bar', 'hist', 'box', 'kde', 'area', 'scatter', 'hexbin', 'pie'
# Passing the first plot's axes as `ax` draws the second scatter on the same axes.
ax = data.plot.scatter(x='A', y='B', color='DarkBlue', label="Class 1")
data.plot.scatter(x='A', y='C', color='LightGreen', label='Class 2', ax=ax)
plt.show()
# Archiver | Mobile version | ScienceNet ( 京ICP备07017567号-12 )
# GMT+8, 2024-4-19 07:08
# Powered by ScienceNet.cn
# Copyright © 2007- China Science Daily