参考:
# Demonstrates DataFrame.duplicated / DataFrame.drop_duplicates, and how
# sorting beforehand controls which of several duplicate rows is kept.
import numpy as np
import pandas as pd

data = pd.DataFrame(
    [[2, 3, np.nan], [1, 2, 3], [2, 3, 4], [1, 2, 3]],
    columns=["a", "b", "c"],
)
print(data)
#    a  b    c
# 0  2  3  NaN
# 1  1  2  3.0
# 2  2  3  4.0
# 3  1  2  3.0

# Whole-row comparison: only row 3 repeats an earlier row (row 1).
print(data.duplicated())
# 0    False
# 1    False
# 2    False
# 3     True
# dtype: bool

# Comparing only columns 'a' and 'b': rows 2 and 3 repeat rows 0 and 1.
print(data.duplicated(subset=["a", "b"]))
# 0    False
# 1    False
# 2     True
# 3     True
# dtype: bool

print(data.drop_duplicates())
#    a  b    c
# 0  2  3  NaN
# 1  1  2  3.0
# 2  2  3  4.0

# The first occurrence is kept, so for (a, b) == (2, 3) the row whose 'c'
# is NaN (row 0) survives and the row with a real value (row 2) is dropped.
print(data.drop_duplicates(subset=["a", "b"]))
#    a  b    c
# 0  2  3  NaN
# 1  1  2  3.0

# Sort so that NaN rows come last; the following drop_duplicates then keeps
# the non-NaN row of each (a, b) group. na_position is spelled out because
# the result depends on it, and a stable sort keeps tied rows in their
# original relative order so the surviving index labels are deterministic.
data.sort_values(by="c", na_position="last", kind="mergesort", inplace=True)
print(data)
#    a  b    c
# 1  1  2  3.0
# 3  1  2  3.0
# 2  2  3  4.0
# 0  2  3  NaN

print(data.drop_duplicates(subset=["a", "b"]))
#    a  b    c
# 1  1  2  3.0
# 2  2  3  4.0