Pandas库的使用

发布于 2019-08-04  119 次阅读


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# A Series is a one-dimensional labelled array.  No index is given, so the
# labels default to 0..5; the single np.nan entry makes the dtype float64.
s = pd.Series(data=[1, 3, 6, np.nan, 44, 1])
print(s)
'''
0     1.0
1     3.0
2     6.0
3     NaN
4    44.0
5     1.0
dtype: float64
'''
# Six consecutive calendar days starting at 2016-01-01 (daily frequency).
dates = pd.date_range(start='20160101', periods=6)
print(dates)
'''
DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
               '2016-01-05', '2016-01-06'],
              dtype='datetime64[ns]', freq='D')
'''
# 6x4 frame of standard-normal draws: one row per date, columns a-d.
# The generator is not seeded, so the printed numbers differ on every run
# and the sample below is only illustrative.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=['a', 'b', 'c', 'd'],
)
print(df)
'''
                   a         b         c         d
2016-01-01  0.384166  0.243069 -0.103707  0.627495
2016-01-02  0.010461  0.677476 -1.463502  1.301071
2016-01-03 -1.167638 -0.977465 -0.790679 -1.024166
2016-01-04  1.816999  0.967527  0.209683 -0.122595
2016-01-05 -0.257573 -1.516078  0.728106  1.948239
2016-01-06 -0.402276  0.400930  0.894255 -2.427162
'''
# Build a frame from a dict of columns.  The scalar entries ('A', 'B', 'E')
# are broadcast to the length set by the array-like entries ('C', 'D', 'F'),
# and every column keeps its own dtype.
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': 'foo',
    'F': np.arange(4)
})
print(df2)
'''
     A          B    C  D    E  F
0  1.0 2013-01-02  1.0  3  foo  0
1  1.0 2013-01-02  1.0  3  foo  1
2  1.0 2013-01-02  1.0  3  foo  2
3  1.0 2013-01-02  1.0  3  foo  3
'''
# Per-column dtypes: C is float32 and D is int32 as requested above; the
# other columns get whatever pandas infers from the values.
print(df2.dtypes)
# Row labels, taken from the index of the Series used for column 'C'.
# NOTE: the sample is from an older pandas; pandas >= 2.0 has no Int64Index
# and prints `Index([0, 1, 2, 3], dtype='int64')` instead.
print(df2.index)
'''
Int64Index([0, 1, 2, 3], dtype='int64')
'''
# Column labels.
print(df2.columns)
"""
Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
"""
# Underlying data as a NumPy array; mixed column types give an object array.
# The exact element repr depends on the pandas/NumPy versions installed.
print(df2.values)
'''
[[1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'foo' 0]
 [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'foo' 1]
 [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'foo' 2]
 [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'foo' 3]]
'''
# Summary statistics.  The sample shows the numeric columns only, as printed
# by older pandas; pandas >= 2.0 also summarises the datetime column 'B'.
print(df2.describe())
'''
         A    C    D         F
count  4.0  4.0  4.0  4.000000
mean   1.0  1.0  3.0  1.500000
std    0.0  0.0  0.0  1.290994
min    1.0  1.0  3.0  0.000000
25%    1.0  1.0  3.0  0.750000
50%    1.0  1.0  3.0  1.500000
75%    1.0  1.0  3.0  2.250000
max    1.0  1.0  3.0  3.000000
'''
# Transpose: columns become rows and vice versa.
print(df2.T)
# axis=1 sorts by column LABEL (not by values); descending gives F..A.
print(df2.sort_index(axis=1, ascending=False))
'''
   F    E  D    C          B    A
0  0  foo  3  1.0 2013-01-02  1.0
1  1  foo  3  1.0 2013-01-02  1.0
2  2  foo  3  1.0 2013-01-02  1.0
3  3  foo  3  1.0 2013-01-02  1.0
'''
# Sort rows by the VALUES of column 'F', largest first.
print(df2.sort_values(by='F', ascending=False))
'''
     A          B    C  D    E  F
3  1.0 2013-01-02  1.0  3  foo  3
2  1.0 2013-01-02  1.0  3  foo  2
1  1.0 2013-01-02  1.0  3  foo  1
0  1.0 2013-01-02  1.0  3  foo  0
'''
# Fresh 6x4 integer frame holding 0..23, indexed by six consecutive days.
dates = pd.date_range('20160101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])

# --- Selection -------------------------------------------------------------
print(df['A'], df.A)  # two equivalent ways to select a single column
print(df[0:3])  # a bare slice selects ROWS (here the first three)
print(df.loc['20160102'])  # label-based: the row for 2016-01-02
print(df.loc[:, ['A', 'B']])  # label-based: all rows, COLUMNS A and B
'''
             A   B
2016-01-01   0   1
2016-01-02   4   5
2016-01-03   8   9
2016-01-04  12  13
2016-01-05  16  17
2016-01-06  20  21
'''
# Position-based selection takes rows AND columns in one indexer.  Chaining
# a second [] after .iloc[...] would slice the rows a second time instead of
# picking columns.
print(df.iloc[1:3, 1:4])  # rows 1-2, columns 1-3
print(df.iloc[[1, 4], 1:3])  # rows 1 and 4, columns 1-2
print(df[df.A < 8])  # boolean mask: keep rows where A < 8

# --- Modification ----------------------------------------------------------
df.iloc[2, 2] = 111  # set by position
df.loc['20160102', 'B'] = 111  # set by label
# Assign through a single .loc call.  The chained form `df.B[mask] = 0`
# writes into an intermediate object, which pandas warns about and which
# leaves df unchanged when Copy-on-Write is enabled.
df.loc[df.A > 4, 'B'] = 0
df['F'] = np.nan  # new column filled with NaN
# Adding a column from a Series aligns on the index, so the Series index
# must match the frame's index.
df['E'] = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20160101', periods=6))

# --- Missing data ----------------------------------------------------------
# B and C hold integers, which cannot store NaN.  Cast them to float
# explicitly rather than relying on the deprecated silent upcast when a NaN
# is assigned into an integer column.
df = df.astype({'B': 'float64', 'C': 'float64'})
df.iloc[0, 1] = np.nan
df.iloc[1, 2] = np.nan
print(df)
# axis=1 drops COLUMNS that contain any NaN (axis=0 would drop rows).
print(df.dropna(axis=1, how='any'))
'''
             A   D  E
2016-01-01   0   3  1
2016-01-02   4   7  2
2016-01-03   8  11  3
2016-01-04  12  15  4
2016-01-05  16  19  5
2016-01-06  20  23  6
'''
print(df.fillna(value=0))  # replace every NaN with 0
print(df.isna())  # boolean frame, True where a value is missing
print(df.isnull().values.any())  # single bool: True if anything is missing
# Concatenating: stack several frames along an axis, e.g.
# pd.concat([pd1, pd2, pd3], axis=0, ignore_index=True)

# Merge: heading only; no merge example is given in this part of the notes.

# 1000x4 frame of standard-normal draws labelled 0..999.  (The Series that
# was previously assigned to `data` just before this was overwritten without
# being used, so it has been dropped.)
data = pd.DataFrame(np.random.randn(1000, 4),
                    index=np.arange(1000),
                    columns=list("ABCD"))
print(data)
# Running total down each column, i.e. four independent random walks.
data = data.cumsum()
print(data)
# Plot kinds available through DataFrame.plot include:
# 'bar', 'hist', 'box', 'kde', 'area', 'scatter', 'hexbin', 'pie'
# Draw A-vs-B first and keep the returned axes, then overlay A-vs-C on the
# same axes by passing it back through ax=.
ax = data.plot.scatter(
    x='A', y='B',
    color='DarkBlue', label='Class 1',
)
data.plot.scatter(
    x='A', y='C',
    color='DarkGreen', label='Class 2',
    ax=ax,
)
plt.show()

像烟花也是过一生,像樱花也是一生,只要亮过和盛开不就好了么?