需求:一个诊断名称可能对应多个诊断编码,对每个诊断名称取出现次数最多的那一个编码(编码为空的记录不参与统计)。
# Pick, for every diagnosis name, the diagnosis code that occurs most often.
# Rows whose code is the empty string are ignored. The winning
# (diag_name, code, count) rows are written to `result_file` when this file
# is run as a script.
from pathlib import Path

import pandas as pd

# Sample input: one diagnosis name maps to several codes, some of them empty.
df = pd.DataFrame(
    [
        ['糖尿病', '1'],
        ['糖尿病', '2'],
        ['糖尿病', '3'],
        ['糖尿病', '1'],
        ['糖尿病', ''],
        ['糖尿病', ''],
        ['高血压', '1'],
        ['高血压', '2'],
        ['高血压', '3'],
        ['高血压', '1'],
        ['高血压', ''],
        ['高血压', ''],
    ],
    columns=['diag_name', 'code'],
)
# Total number of input rows (kept as a module-level name; no longer needed
# for iteration because the filter below is vectorized).
sheet_rows = df.shape[0]
result_file = './result/diag_stand.csv'

# Column names of the filtered frame.
df_col_names = ['diag_name', 'code']

# 1. Drop the rows whose "code" column is empty.
#    A boolean mask replaces the former row-by-row `DataFrame.append` loop:
#    `append` was removed in pandas 2.0 and copied the frame on every call.
df_result = df.loc[df['code'] != '', df_col_names].reset_index(drop=True)

# 2. Count the occurrences of every (diag_name, code) pair, store them in a
#    "count" column, and sort by diag_name and count, both descending, so the
#    most frequent code comes first within each diagnosis name.
gp = (
    df_result.groupby(by=['diag_name', 'code'])
    .size()
    .reset_index(name='count')
    .sort_values(by=['diag_name', 'count'], ascending=[False, False])
)
# print(gp)

# 3. head(1) keeps the first row of every diag_name group, i.e. the most
#    frequent code. The final sort orders the winners by count, descending;
#    a stable sort keeps the order of equal counts deterministic.
gp2 = (
    gp.groupby('diag_name')
    .head(1)
    .sort_values(by=['count'], ascending=False, kind='stable')
)

if __name__ == '__main__':
    # Create the output directory first; to_csv does not create missing
    # parent directories.
    output_path = Path(result_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    gp2.to_csv(output_path, index=False)
处理前数据
处理后数据
reference
1.Python技巧之对Dataframe进行多列排序
https://blog.csdn.net/m0_37637511/article/details/79901071