pandas 删除重复数据:duplicated() 与 drop_duplicates() 的用法

参考:

import numpy as np
import pandas as pd

# Demo: detecting and removing duplicate rows with pandas.
#
# The sample frame is built so that rows 1 and 3 are exact duplicates, while
# rows 0 and 2 share the same ('a', 'b') values but differ in 'c'
# (NaN vs 4.0).
data = pd.DataFrame(
    [
        [2, 3, np.nan],
        [1, 2, 3],
        [2, 3, 4],
        [1, 2, 3],
    ],
    columns=list('abc'),
)

print(data)

#    a  b    c
# 0  2  3  NaN
# 1  1  2  3.0
# 2  2  3  4.0
# 3  1  2  3.0

# Flag rows that repeat an earlier row across ALL columns.
print(data.duplicated())
# 0    False
# 1    False
# 2    False
# 3     True
# dtype: bool

# Flag rows that repeat an earlier row when only 'a' and 'b' are compared.
print(data.duplicated(subset=['a', 'b']))
# 0    False
# 1    False
# 2     True
# 3     True
# dtype: bool

# Drop full-row duplicates; the first occurrence is kept.
print(data.drop_duplicates())
#    a  b    c
# 0  2  3  NaN
# 1  1  2  3.0
# 2  2  3  4.0

# Drop duplicates judged on ('a', 'b') only. The first occurrence wins, so
# the row with c == NaN (index 0) is kept over the row with c == 4.0.
print(data.drop_duplicates(subset=['a', 'b']))
#    a  b    c
# 0  2  3  NaN
# 1  1  2  3.0

# Sort so that NaN values in 'c' come last; the following drop_duplicates
# call then keeps the non-NaN row of each ('a', 'b') group.
# - na_position='last' states that intent explicitly instead of relying on
#   the default.
# - kind='mergesort' is a stable sort, so rows with equal 'c' (index 1 and 3)
#   keep their original relative order and the result is deterministic.
data.sort_values(by='c', kind='mergesort', na_position='last', inplace=True)
print(data)
#    a  b    c
# 1  1  2  3.0
# 3  1  2  3.0
# 2  2  3  4.0
# 0  2  3  NaN

print(data.drop_duplicates(subset=['a', 'b']))
#    a  b    c
# 1  1  2  3.0
# 2  2  3  4.0

标签: nan、3.0、print、4.0、subset、面试
  • 回复
隐藏