"""Pandas study notes: adding columns, replace, arithmetic alignment, sorting.

Only the last section (sorting a DataFrame) is live code.  The earlier
sections are kept as commented-out reference snippets, each followed by the
output it printed.  The outputs of the random-data snippets were captured on
separate runs, so the numbers shown differ from one snippet to the next.
"""

import numpy as np
import pandas as pd
from pandas import DataFrame, Series

# -----------------------------
# Adding a column to a DataFrame
# df1 = DataFrame({'city':["Beijing","Shanghai","Guangzhou"],'population':[1000,2000,1500]})
# print(df1)
#         city  population
# 0    Beijing        1000
# 1   Shanghai        2000
# 2  Guangzhou        1500

# Method 1: assign a Series (works here because df1 has the default index)
# df1["GDP"] = Series([1000,2000,1500])
# print(df1)
#         city  population   GDP
# 0    Beijing        1000  1000
# 1   Shanghai        2000  2000
# 2  Guangzhou        1500  1500

# Method 2: map an existing column through a dict
# df2 = DataFrame({'city':["Beijing","Shanghai","Guangzhou"],'population':[1000,2000,1500]},index=["A","B","C"])
# gdp_map = {"Beijing":1000,"Shanghai":2000,"Guangzhou":1500}
# df2["GDP"] = df2["city"].map(gdp_map)
# print(df2)
#         city  population   GDP
# A    Beijing        1000  1000
# B   Shanghai        2000  2000
# C  Guangzhou        1500  1500

# When the index is no longer the default one, the new Series has to be
# created with the same index labels.
# df3 = DataFrame({'city':["Beijing","Shanghai","Guangzhou"],'population':[1000,2000,1500]},index=["A","B","C"])
# df3["GDP"] = Series([1000,2000,1500])
# print(df3)  # the values are not filled in
#         city  population  GDP
# A    Beijing        1000  NaN
# B   Shanghai        2000  NaN
# C  Guangzhou        1500  NaN
# df3["GDP"] = Series([1000,2000,1500],index=["A","B","C"])
# print(df3)
#         city  population   GDP
# A    Beijing        1000  1000
# B   Shanghai        2000  2000
# C  Guangzhou        1500  1500

# -----------------------------
# replace in Series
# s1 = Series(np.arange(10))
# print(s1)
# 0    0
# 1    1
# 2    2
# 3    3
# 4    4
# 5    5
# 6    6
# 7    7
# 8    8
# 9    9
# dtype: int32
# print(s1.replace(1,np.nan))
# 0    0.0
# 1    NaN
# 2    2.0
# 3    3.0
# 4    4.0
# 5    5.0
# 6    6.0
# 7    7.0
# 8    8.0
# 9    9.0
# dtype: float64
# print(s1.replace([1,2,3],[10,20,30]))
# 0     0
# 1    10
# 2    20
# 3    30
# 4     4
# 5     5
# 6     6
# 7     7
# 8     8
# 9     9
# dtype: int64

# -----------------------
# Simple arithmetic on Series and DataFrame
# s1 = Series([1,2,3],index=["A","B","C"])
# s2 = Series([4,5,6,7],index=["B","C","D","E"])
# print(s1)
# A    1
# B    2
# C    3
# dtype: int64
# print(s2)
# B    4
# C    5
# D    6
# E    7
# dtype: int64
# print(s1+s2)  # labels present in only one operand come out as NaN
# A    NaN
# B    6.0
# C    8.0
# D    NaN
# E    NaN
# dtype: float64

# DataFrame arithmetic
# df1 = DataFrame(np.arange(4).reshape(2,2),index=["A","B"],columns=["BJ","SH"])
# print(df1)
#    BJ  SH
# A   0   1
# B   2   3
# df2 = DataFrame(np.arange(9).reshape(3,3),index=["A","B","C"],columns=["BJ","SH","GZ"])
# print(df2)
#    BJ  SH  GZ
# A   0   1   2
# B   3   4   5
# C   6   7   8
# print(df1+df2)
#     BJ  GZ   SH
# A  0.0 NaN  2.0
# B  5.0 NaN  7.0
# C  NaN NaN  NaN

# df3 = DataFrame([[1,2,3],[4,5,np.nan],[7,8,9]],index=["A","B","C"],columns=["c1","c2","c3"])
# print(df3)
#    c1  c2   c3
# A   1   2  3.0
# B   4   5  NaN
# C   7   8  9.0
# print(df3.sum())
# c1    12.0
# c2    15.0
# c3    12.0
# dtype: float64
# print(df3.sum(axis=1))
# A     6.0
# B     9.0
# C    24.0
# dtype: float64
# print(df3.min())
# c1    1.0
# c2    2.0
# c3    3.0
# dtype: float64
# print(df3.max())
# c1    7.0
# c2    8.0
# c3    9.0
# dtype: float64
# print(df3.describe())
#         c1   c2        c3
# count  3.0  3.0  2.000000
# mean   4.0  5.0  6.000000
# std    3.0  3.0  4.242641
# min    1.0  2.0  3.000000
# 25%    2.5  3.5  4.500000
# 50%    4.0  5.0  6.000000
# 75%    5.5  6.5  7.500000
# max    7.0  8.0  9.000000

# -----------------------------
# Sorting Series and DataFrame
# s1 = Series(np.random.randn(10))
# print(s1)
# 0   -1.745069
# 1   -3.339463
# 2    2.245615
# 3    0.201136
# 4   -0.115314
# 5   -0.425709
# 6   -1.037263
# 7    0.015670
# 8   -0.514211
# 9   -0.122862
# dtype: float64
# print(s1.values)
# [-0.46066427 -0.01673619 -0.79758999 -0.99447067 -1.2554336   0.95775716
#  -0.98716949  0.81775325 -0.95819146 -0.38062781]
# print(s1.index)  # RangeIndex(start=0, stop=10, step=1)
# s2 = s1.sort_values()
# print(s2)
# 3   -1.533961
# 1   -0.777431
# 5   -0.587565
# 2   -0.463069
# 7   -0.257701
# 0   -0.037266
# 6    0.062657
# 9    0.149767
# 8    0.245388
# 4    2.024740
# dtype: float64
# s2 = s1.sort_values(ascending=False)
# print(s2)
# 1    1.905997
# 6    0.369854
# 0    0.346478
# 2    0.283084
# 3    0.152866
# 4    0.145149
# 5   -0.362064
# 8   -0.627749
# 7   -0.738645
# 9   -0.905832
# dtype: float64
# print(s2.sort_index())
# 0    0.250688
# 1   -0.005753
# 2    0.818747
# 3    1.074309
# 4    0.057101
# 5   -1.576862
# 6   -1.358057
# 7   -0.774541
# 8    1.260600
# 9    0.028084
# dtype: float64

# -----------------------------
# Sorting a DataFrame (live code)

# 40 random normal samples arranged as 8 rows x 5 columns.
df1 = DataFrame(np.random.randn(40).reshape(8,5),columns=["A","B","C","D","E"])
print(df1)
#           A         B         C         D         E
# 0  1.301407  0.079596 -0.324598 -0.489004 -0.319954
# 1  1.627349 -1.848241 -1.535149  0.616749 -0.581343
# 2 -1.599599  0.177486  0.413103 -0.121707 -0.771692
# 3 -0.346563  2.376872 -0.299881 -0.038205 -1.101628
# 4  2.000585 -0.087473  1.679934 -1.520698 -0.037990
# 5 -0.622608  0.178647  0.511137  0.001924  1.104219
# 6  0.680216  0.616194  0.492893 -1.495716 -2.129312
# 7  0.769310 -0.425242  0.270568 -1.340633 -0.507089

# Sort a single column: the result is a Series that keeps the row labels.
print(df1["A"].sort_values())
# 0   -0.781176
# 5   -0.699767
# 4   -0.257146
# 6   -0.168928
# 2   -0.160794
# 1    0.348743
# 3    1.015523
# 7    1.750817
# Name: A, dtype: float64

# Sort the whole frame by column "A": every row moves as a unit.
print(df1.sort_values("A"))
#           A         B         C         D         E
# 7 -1.667484  1.052349 -0.786262  1.515977 -1.663600
# 1 -0.755957 -0.748133 -0.078783  1.221847  1.087867
# 0 -0.624164 -0.225844  0.146987  0.209596 -1.327463
# 5 -0.362764  0.958340  0.580041 -1.062712  0.233652
# 4 -0.184361  0.924434  0.304635  1.863528  0.775122
# 6  0.406105  0.030612 -1.115804  2.543703 -0.234756
# 3  0.657304  1.464882  0.091570 -1.226326 -1.272059
# 2  2.092520 -0.210072 -0.693642  0.152570  0.659520

df2 = df1.sort_values("A")
# Sorting by index again puts the rows back in their original order.
print(df2.sort_index())  # same as df1
#           A         B         C         D         E
# 0 -0.612644 -0.795620  1.621510 -1.316650  1.504513
# 1 -1.770057 -1.740721  2.078625 -1.738596  0.345799
# 2  0.697535  1.126456  0.591017  0.272984  1.004823
# 3  1.323213  0.630537  1.063169 -0.682980  0.630861
# 4  0.292257 -0.683437 -2.204945 -0.997271  0.535046
# 5  1.441142  0.637664  0.801728 -0.249832  2.079914
# 6 -0.647377  0.078151 -0.649099 -0.360512  0.692393
# 7  0.333072  1.713874  0.672938  0.130204 -1.050239