Shingrace的个人博客分享 http://blog.sciencenet.cn/u/Shingrace

博文

Pandas--basis

已有 1902 次阅读 2017-5-30 22:38 |个人分类:python|系统分类:科研笔记


###   pandas

import pandas as pd

import numpy as np


# A Series is a labelled one-dimensional array; np.nan marks a missing value.
s = pd.Series([1, 3, 6, np.nan, 44, 1])

# Six consecutive days starting at 2017-01-01, used below as row labels.
dates = pd.date_range(start="20170101", periods=6)

# index   -> row labels
# columns -> column labels
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,
    columns=list("abcd"),
)

# The bare expressions below only show something in an interactive session
# (REPL / notebook); when run as a script their results are discarded.
df.dtypes        # dtype of each column
df.index         # row labels
df.columns       # column labels
df.values        # underlying data as a NumPy array
df.describe()    # summary statistics per column
df.transpose()   # rows and columns swapped

# Sorting returns a new object and leaves df itself untouched.
df.sort_index(axis="columns", ascending=False)  # order columns by label, descending
df.sort_values("a")                             # order rows by the values in column "a"


################################################

# Row / column selection on a DataFrame whose rows are labelled by date.
dates = pd.date_range("20130101", periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=["A", "B", "C", "D"])

# Column access: item syntax and attribute syntax give the same Series.
print(df["A"], df.A)

# A slice inside [] selects rows, either by position ...
print(df[0:3])
# ... or by label (a label slice includes both end points).
print(df["20130102":"20130104"])

# select by label: loc
print(df.loc["20130102"])
print(df.loc[:, ["A", "B"]])
print(df.loc["20130102":, ["A", "B"]])

# select by position: iloc
print(df.iloc[3:5, 1:3])

# mixed selection: rows by position, columns by label.
# DataFrame.ix was deprecated in pandas 0.20 and removed in 1.0, so the
# positions are first turned into labels and the lookup is done with loc.
print(df.loc[df.index[:3], ["A", "C"]])

# Boolean indexing -- keep only the rows that satisfy a condition.
print(df)
print(df[df.A > 8])





#=============================================================

# Assigning values and handling missing data.
dates = pd.date_range("20130101", periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=["A", "B", "C", "D"])

# Set a single cell by position, then by label.
df.iloc[2, 2] = 1111
df.loc["20130101", "B"] = 2222

# Set every column to 0 in the rows where A > 4.
df[df.A > 4] = 0

# Set one column only where the condition holds. A single .loc call is used
# instead of chained indexing (df.A[mask] = 0): the chained form assigns into
# an intermediate object, is not guaranteed to reach df, and is a no-op under
# pandas Copy-on-Write.
df.loc[df.A > 4, "A"] = 0
df.loc[df.A > 4, "B"] = 0

# Add a new column filled with NaN.
df["F"] = np.nan

# Add a new column from a Series; values are aligned on the index labels.
df["E"] = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range("20130101", periods=6))

# dropna / fillna return new objects; df itself is not modified here.
df.dropna(axis=0, how="any")  # how in {"any", "all"}: drop rows containing NaN
df.dropna(axis=1, how="any")  # how in {"any", "all"}: drop columns containing NaN
df.fillna(value=0)            # replace NaN with 0

df.isnull()                   # boolean mask of the missing entries

# True if at least one entry anywhere in the frame is missing. Going through
# .values gives a single scalar on every pandas version.
df.isnull().values.any()

#--------------------------------



#data = pd.read_csv("student.csv")  # read_csv can also read .txt files

# save to disk

#data.to_pickle("student.pickle")


#合并多个dataframe

#concatenating

# Combining several DataFrames with pd.concat.
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=["a", "b", "c", "d"])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=["a", "b", "c", "d"])
df3 = pd.DataFrame(np.ones((3, 4)) * 2, columns=["a", "b", "c", "d"])

# Stack vertically; the original row labels (0, 1, 2) are repeated.
res = pd.concat([df1, df2, df3], axis=0)
print(res)

# Stack vertically and renumber the rows 0..n-1.
res = pd.concat([df1, df2, df3], axis=0, ignore_index=True)

#--------------------------------------
# join: "inner" or "outer"
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=["a", "b", "c", "d"], index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=["b", "c", "d", "e"], index=[2, 3, 4])
print(df1)
print(df2)

res = pd.concat([df1, df2])
res = pd.concat([df1, df2], join="outer")  # default: union of the columns, gaps become NaN
res = pd.concat([df1, df2], join="inner")  # keep only the columns present in both frames
res = pd.concat([df1, df2], join="inner", ignore_index=True)

# Side-by-side concat restricted to df1's row labels.
# The join_axes argument was removed in pandas 1.0; reindexing the result on
# df1.index gives the same rows.
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=["a", "b", "c", "d"], index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=["b", "c", "d", "e"], index=[2, 3, 4])
res = pd.concat([df1, df2], axis=1).reindex(df1.index)

# Append a Series as one extra row. DataFrame.append was removed in
# pandas 2.0, so the Series is turned into a one-row frame and concatenated.
s1 = pd.Series([1, 2, 3, 4], index=["a", "b", "c", "d"])
res = pd.concat([df1, s1.to_frame().T], ignore_index=True)

# Append several frames at once (formerly df1.append([df1, df2, df3])).
res = pd.concat([df1, df1, df2, df3])

#------------
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=["a", "b", "c", "d"])
s1 = pd.Series([1, 2, 3, 4], index=["a", "b", "c", "d"])

# Add one row of data to the bottom of df1.
res = pd.concat([df1, s1.to_frame().T], ignore_index=True)


#------合并dataframe

# NOTE: a "from __future__ import print_function" line used to sit here.
# A __future__ import is only legal as the first statement of a module
# (anywhere else it is a SyntaxError), and print() is already a function on
# Python 3, so the statement has been dropped.
import pandas as pd


# merging two df by key/keys. (may be used in database)

# simple example: both frames share the column 'key'
left = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K3'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})
right = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K3'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
})
print(left)
print(right)

# Rows are paired wherever the 'key' values match.
res = pd.merge(left, right, on='key')
print(res)


# consider two keys

# Merge on the combination of two key columns.
left = pd.DataFrame(dict(
    key1=['K0', 'K0', 'K1', 'K2'],
    key2=['K0', 'K1', 'K0', 'K1'],
    A=['A0', 'A1', 'A2', 'A3'],
    B=['B0', 'B1', 'B2', 'B3'],
))
right = pd.DataFrame(dict(
    key1=['K0', 'K1', 'K1', 'K2'],
    key2=['K0', 'K0', 'K0', 'K0'],
    C=['C0', 'C1', 'C2', 'C3'],
    D=['D0', 'D1', 'D2', 'D3'],
))
print(left)
print(right)

# how is one of 'left', 'right', 'outer', 'inner'; 'inner' is the default.
res = left.merge(right, how='inner', on=['key1', 'key2'])

# 'left' keeps every row of left; unmatched rows get NaN in C and D.
res = left.merge(right, how='left', on=['key1', 'key2'])
print(res)


# indicator

# The indicator option adds a column telling where each merged row came from.
df1 = pd.DataFrame(dict(col1=[0, 1], col_left=['a', 'b']))
df2 = pd.DataFrame(dict(col1=[1, 2, 2], col_right=[2, 2, 2]))
print(df1)
print(df2)

# indicator=True adds the column under its default name.
res = df1.merge(df2, how='outer', on='col1', indicator=True)

# Passing a string instead gives the indicator column a custom name.
res = df1.merge(df2, how='outer', on='col1', indicator='indicator_column')



# merged by index

# Merge using the row labels as the join key instead of a column.
left = pd.DataFrame(
    dict(A=['A0', 'A1', 'A2'], B=['B0', 'B1', 'B2']),
    index=['K0', 'K1', 'K2'],
)
right = pd.DataFrame(
    dict(C=['C0', 'C2', 'C3'], D=['D0', 'D2', 'D3']),
    index=['K0', 'K2', 'K3'],
)
print(left)
print(right)

# left_index / right_index tell merge to match on the index of each frame.
res = left.merge(right, how='outer', left_index=True, right_index=True)
res = left.merge(right, how='inner', left_index=True, right_index=True)


# handle overlapping

# Both frames have an 'age' column; suffixes keep the two apart after merging.
boys = pd.DataFrame(dict(k=['K0', 'K1', 'K2'], age=[1, 2, 3]))
girls = pd.DataFrame(dict(k=['K0', 'K0', 'K3'], age=[4, 5, 6]))

res = boys.merge(girls, how='inner', on='k', suffixes=('_boy', '_girl'))
print(res)


# The join function in pandas is similar to merge; once you know merge, join is easy to understand.

#-====================================================



# NOTE: a "from __future__ import print_function" line used to sit here.
# A __future__ import is only legal as the first statement of a module
# (anywhere else it is a SyntaxError), and print() is already a function on
# Python 3, so the statement has been dropped.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# plot data

# Series: 1000 random steps accumulated into a random walk.
data = pd.Series(np.random.randn(1000), index=np.arange(1000))
data = data.cumsum()
##data.plot()

# DataFrame: four random walks in the columns A-D (replaces the Series above).
data = pd.DataFrame(np.random.randn(1000, 4), index=np.arange(1000), columns=list("ABCD"))
data = data.cumsum()

# plot methods:
# 'bar', 'hist', 'box', 'kde', 'area', 'scatter', 'hexbin', 'pie'

# Draw A-vs-B, then pass the returned axes as ax= so that A-vs-C is drawn
# on the same axes.
ax = data.plot.scatter(x='A', y='B', color='DarkBlue', label="Class 1")
data.plot.scatter(x='A', y='C', color='LightGreen', label='Class 2', ax=ax)

plt.show()




https://blog.sciencenet.cn/blog-669638-1058062.html

上一篇:Numpy-basis
下一篇:Kivy-- [CRITICAL] [App] Unable to get a Window, abort.
收藏 IP: 36.102.227.*| 热度|

0

该博文允许注册用户评论 请点击登录 评论 (0 个评论)

数据加载中...

Archiver|手机版|科学网 ( 京ICP备07017567号-12 )

GMT+8, 2024-4-19 07:08

Powered by ScienceNet.cn

Copyright © 2007- 中国科学报社

返回顶部