数据整理
 - 定义 
  
   - 在数据清洗过程中,很多时候需要将不同的数据整理在一起,方便后续的分析,这个过程也叫数据合并
 
  
  
 - 合并方法 
  
   - 常见的合并方法有堆叠和按主键进行合并,堆叠又分为横向堆叠和纵向堆叠,按主键合并类似于sql里面的关联操作
 
  
  
 - 横向堆叠将两张表或多张表在X轴方向,即横向拼接在一起
 
 - 纵向堆叠将两张表或多张表在Y轴方向,即纵向拼接在一起
 
 - 注意join 取inner或者outer,分别代表交集和并集
 
import xlrd
import os
import pandas as pd
import numpy as np
  
 
os.getcwd()
  
 
'D:\\Jupyter\\notebook\\Python数据清洗实战\\数据清洗之数据表处理'
  
 
os.chdir('D:\\Jupyter\\notebook\\Python数据清洗实战\\数据')
  
 
workbook = xlrd.open_workbook('meal_order_detail.xlsx')
  
 
sheet_name = workbook.sheet_names()
  
 
sheet_name
  
 
['meal_order_detail1', 'meal_order_detail2', 'meal_order_detail3']
  
 
order1 = pd.read_excel('meal_order_detail.xlsx', sheet_name='meal_order_detail1')
order2 = pd.read_excel('meal_order_detail.xlsx', sheet_name='meal_order_detail2')
order3 = pd.read_excel('meal_order_detail.xlsx', sheet_name='meal_order_detail3')
  
 
order = pd.concat([order1, order2, order3], axis=0, ignore_index=True)
  
 
order.tail(5)
  
 
 
 
  
   
     | 
    detail_id | 
    order_id | 
    dishes_id | 
    logicprn_name | 
    parent_class_name | 
    dishes_name | 
    itemis_add | 
    counts | 
    amounts | 
    cost | 
    place_order_time | 
    discount_amt | 
    discount_reason | 
    kick_back | 
    add_inprice | 
    add_info | 
    bar_code | 
    picture_file | 
    emp_id | 
   
  
  
   
    | 10032 | 
    5683 | 
    672 | 
    610049 | 
    NaN | 
    NaN | 
    爆炒双丝 | 
    0 | 
    1 | 
    35 | 
    NaN | 
    2016-08-31 21:53:30 | 
    NaN | 
    NaN | 
    NaN | 
    0 | 
    NaN | 
    NaN | 
    caipu/301003.jpg | 
    1089 | 
   
   
    | 10033 | 
    5686 | 
    672 | 
    609959 | 
    NaN | 
    NaN | 
    小炒羊腰\r\n\r\n\r\n | 
    0 | 
    1 | 
    36 | 
    NaN | 
    2016-08-31 21:54:40 | 
    NaN | 
    NaN | 
    NaN | 
    0 | 
    NaN | 
    NaN | 
    caipu/202005.jpg | 
    1089 | 
   
   
    | 10034 | 
    5379 | 
    647 | 
    610012 | 
    NaN | 
    NaN | 
    香菇鹌鹑蛋 | 
    0 | 
    1 | 
    39 | 
    NaN | 
    2016-08-31 21:54:44 | 
    NaN | 
    NaN | 
    NaN | 
    0 | 
    NaN | 
    NaN | 
    caipu/302001.jpg | 
    1094 | 
   
   
    | 10035 | 
    5380 | 
    647 | 
    610054 | 
    NaN | 
    NaN | 
    不加一滴油的酸奶蛋糕 | 
    0 | 
    1 | 
    7 | 
    NaN | 
    2016-08-31 21:55:24 | 
    NaN | 
    NaN | 
    NaN | 
    0 | 
    NaN | 
    NaN | 
    caipu/501003.jpg | 
    1094 | 
   
   
    | 10036 | 
    5688 | 
    672 | 
    609953 | 
    NaN | 
    NaN | 
    凉拌菠菜 | 
    0 | 
    1 | 
    27 | 
    NaN | 
    2016-08-31 21:56:54 | 
    NaN | 
    NaN | 
    NaN | 
    0 | 
    NaN | 
    NaN | 
    caipu/303004.jpg | 
    1089 | 
   
  
 
 
basic = pd.DataFrame()
  
 
for i in sheet_name: basic_i = pd.read_excel('meal_order_detail.xlsx', sheet_name=i) basic = pd.concat([basic, basic_i], axis=0, ignore_index=True)
  
 
basic.shape
  
 
(10037, 19)
  
 
basic.tail(5)
  
 
 
 
  
   
     | 
    detail_id | 
    order_id | 
    dishes_id | 
    logicprn_name | 
    parent_class_name | 
    dishes_name | 
    itemis_add | 
    counts | 
    amounts | 
    cost | 
    place_order_time | 
    discount_amt | 
    discount_reason | 
    kick_back | 
    add_inprice | 
    add_info | 
    bar_code | 
    picture_file | 
    emp_id | 
   
  
  
   
    | 10032 | 
    5683 | 
    672 | 
    610049 | 
    NaN | 
    NaN | 
    爆炒双丝 | 
    0 | 
    1 | 
    35 | 
    NaN | 
    2016-08-31 21:53:30 | 
    NaN | 
    NaN | 
    NaN | 
    0 | 
    NaN | 
    NaN | 
    caipu/301003.jpg | 
    1089 | 
   
   
    | 10033 | 
    5686 | 
    672 | 
    609959 | 
    NaN | 
    NaN | 
    小炒羊腰\r\n\r\n\r\n | 
    0 | 
    1 | 
    36 | 
    NaN | 
    2016-08-31 21:54:40 | 
    NaN | 
    NaN | 
    NaN | 
    0 | 
    NaN | 
    NaN | 
    caipu/202005.jpg | 
    1089 | 
   
   
    | 10034 | 
    5379 | 
    647 | 
    610012 | 
    NaN | 
    NaN | 
    香菇鹌鹑蛋 | 
    0 | 
    1 | 
    39 | 
    NaN | 
    2016-08-31 21:54:44 | 
    NaN | 
    NaN | 
    NaN | 
    0 | 
    NaN | 
    NaN | 
    caipu/302001.jpg | 
    1094 | 
   
   
    | 10035 | 
    5380 | 
    647 | 
    610054 | 
    NaN | 
    NaN | 
    不加一滴油的酸奶蛋糕 | 
    0 | 
    1 | 
    7 | 
    NaN | 
    2016-08-31 21:55:24 | 
    NaN | 
    NaN | 
    NaN | 
    0 | 
    NaN | 
    NaN | 
    caipu/501003.jpg | 
    1094 | 
   
   
    | 10036 | 
    5688 | 
    672 | 
    609953 | 
    NaN | 
    NaN | 
    凉拌菠菜 | 
    0 | 
    1 | 
    27 | 
    NaN | 
    2016-08-31 21:56:54 | 
    NaN | 
    NaN | 
    NaN | 
    0 | 
    NaN | 
    NaN | 
    caipu/303004.jpg | 
    1089 | 
   
  
 
 
df = pd.read_csv('baby_trade_history.csv', dtype={'user_id': str})
  
 
df1 = pd.read_csv('sam_tianchi_mum_baby.csv', dtype={'user_id': str})
  
 
df1.head(5)
  
 
 
 
  
   
     | 
    user_id | 
    birthday | 
    gender | 
   
  
  
   
    | 0 | 
    2757 | 
    20130311 | 
    1 | 
   
   
    | 1 | 
    415971 | 
    20121111 | 
    0 | 
   
   
    | 2 | 
    1372572 | 
    20120130 | 
    1 | 
   
   
    | 3 | 
    10339332 | 
    20110910 | 
    0 | 
   
   
    | 4 | 
    10642245 | 
    20130213 | 
    0 | 
   
  
 
 
df.head(5)
  
 
 
 
  
   
     | 
    user_id | 
    auction_id | 
    cat_id | 
    cat1 | 
    property | 
    buy_mount | 
    day | 
   
  
  
   
    | 0 | 
    786295544 | 
    41098319944 | 
    50014866 | 
    50022520 | 
    21458:86755362;13023209:3593274;10984217:21985... | 
    2 | 
    20140919 | 
   
   
    | 1 | 
    532110457 | 
    17916191097 | 
    50011993 | 
    28 | 
    21458:11399317;1628862:3251296;21475:137325;16... | 
    1 | 
    20131011 | 
   
   
    | 2 | 
    249013725 | 
    21896936223 | 
    50012461 | 
    50014815 | 
    21458:30992;1628665:92012;1628665:3233938;1628... | 
    1 | 
    20131011 | 
   
   
    | 3 | 
    917056007 | 
    12515996043 | 
    50018831 | 
    50014815 | 
    21458:15841995;21956:3494076;27000458:59723383... | 
    2 | 
    20141023 | 
   
   
    | 4 | 
    444069173 | 
    20487688075 | 
    50013636 | 
    50008168 | 
    21458:30992;13658074:3323064;1628665:3233941;1... | 
    1 | 
    20141103 | 
   
  
 
 
df2 = pd.merge(left=df, right=df1, how='inner', left_on='user_id', right_on='user_id')
  
 
df2.head(5)
  
 
 
 
  
   
     | 
    user_id | 
    auction_id | 
    cat_id | 
    cat1 | 
    property | 
    buy_mount | 
    day | 
    birthday | 
    gender | 
   
  
  
   
    | 0 | 
    513441334 | 
    19909384116 | 
    50010557 | 
    50008168 | 
    25935:21991;1628665:29784;22019:34731;22019:20... | 
    1 | 
    20121212 | 
    20110105 | 
    1 | 
   
   
    | 1 | 
    377550424 | 
    15771663914 | 
    50015841 | 
    28 | 
    1628665:3233941;1628665:3233942;3914866:11580;... | 
    1 | 
    20121123 | 
    20110620 | 
    1 | 
   
   
    | 2 | 
    47342027 | 
    14066344263 | 
    50013636 | 
    50008168 | 
    21458:21599;13585028:3416646;1628665:3233942;1... | 
    1 | 
    20120911 | 
    20101008 | 
    1 | 
   
   
    | 3 | 
    119784861 | 
    20796936076 | 
    50140021 | 
    50008168 | 
    21458:120325094;22019:2026;22019:34731;22019:3... | 
    1 | 
    20121129 | 
    20120327 | 
    0 | 
   
   
    | 4 | 
    159129426 | 
    15198386301 | 
    50013711 | 
    50008168 | 
    21458:11580;1628665:29778;22019:3340598;22019:... | 
    2 | 
    20120808 | 
    20100825 | 
    0 | 
   
  
 
 文章来源: ruochen.blog.csdn.net,作者:若尘,版权归原作者所有,如需转载,请联系作者。
原文链接:ruochen.blog.csdn.net/article/details/105523421
         
        
        【版权声明】本文为华为云社区用户转载文章,如果您发现本社区中有涉嫌抄袭的内容,欢迎发送邮件进行举报,并提供相关证据,一经查实,本社区将立刻删除涉嫌侵权内容,举报邮箱:
            
cloudbbs@huaweicloud.com 
        
        
        
        
评论(0)