In [1]:
# this section of codes read 3 sheets in an excel file from the same directory
# the two sheets are combined together and same item is added together
# still need to figure out why dtype in read_excel doesn't work properly without any error message
# - dtype didn't read in properly because there are empty rows in the cells (Null / NaN)
# note that when there isn't any Null value, the cells / data types are consistent across multiple sheets
# 11/9/2022
import os
print(os.getcwd())
dirname = 'c:\\MLcourse\\HJ project'
os.chdir(dirname)
print(os.getcwd())
import pandas as pd
import numpy as np
file_name = "Multi_sheet.xlsx"
df = pd.read_excel(file_name, sheet_name=[0,1,2], header=1, usecols="B:E")
# if sheet_name=None, all sheets will be read in as part of the dictionary
# all sheets are stored in the dictionary and can be assessed using key / df[key]
df_add = pd.DataFrame()
print('# of items in df ->', len(df))
for key in df:
print(key, "\n", df[key])
df_add = df_add.append(df[key], ignore_index=True)
# drop Null rows and convert the data types
df_add.dropna(how='all',inplace=True)
df_add = df_add.astype({'Item #':'int64', 'Item description':'str', 'UPC code':'str', 'Quantity':'int64'})
# group items together and add up all the same item # quantity
df_group = df_add.groupby(['Item #','Item description','UPC code']).sum()
print(df_group)
C:\MLCourse\HJ project C:\MLCourse\HJ project # of items in df -> 3 0 Item # Item description UPC code Quantity 0 862082.0 basic car #1 5.813628e+11 4.0 1 582646.0 basic car #2 2.996459e+11 2.0 2 740363.0 basic car #3 4.547111e+11 3.0 3 165209.0 basic car #4 7.560307e+11 1.0 4 745818.0 basic car #5 3.520311e+11 3.0 5 618993.0 basic car #6 2.550729e+11 5.0 6 NaN NaN NaN NaN 7 703883.0 delux car #1 5.642937e+11 3.0 8 173254.0 delux car #2 9.561079e+10 2.0 9 646018.0 delux car #3 8.003565e+11 6.0 10 529847.0 delux car #4 6.777414e+11 3.0 11 56151.0 delux car #5 2.268907e+11 2.0 12 309874.0 delux car #6 5.877168e+11 1.0 1 Item # Item description UPC code Quantity 0 862082.0 basic car #1 5.813628e+11 3.0 1 582646.0 basic car #2 2.996459e+11 5.0 2 740363.0 basic car #3 4.547111e+11 3.0 3 165209.0 basic car #4 7.560307e+11 3.0 4 745818.0 basic car #5 3.520311e+11 6.0 5 618993.0 basic car #6 2.550729e+11 4.0 6 NaN NaN NaN NaN 7 703883.0 delux car #1 5.642937e+11 1.0 8 173254.0 delux car #2 9.561079e+10 3.0 9 646018.0 delux car #3 8.003565e+11 5.0 10 529847.0 delux car #4 6.777414e+11 6.0 11 56151.0 delux car #5 2.268907e+11 7.0 12 309874.0 delux car #6 5.877168e+11 8.0 13 NaN NaN NaN NaN 14 555988.0 special car #1 8.947164e+11 3.0 2 Item # Item description UPC code Quantity 0 56151 delux car #5 226890748623 9 1 165209 basic car #4 756030704398 4 2 173254 delux car #2 95610786143 5 3 309874 delux car #6 587716839953 9 4 529847 delux car #4 677741438376 9 5 555988 special car #1 894716423380 3 6 582646 basic car #2 299645942550 7 7 618993 basic car #6 255072922341 9 8 646018 delux car #3 800356544694 11 9 703883 delux car #1 564293728345 4 10 740363 basic car #3 454711139547 6 11 745818 basic car #5 352031103297 9 12 862082 basic car #1 581362817884 7 Quantity Item # Item description UPC code 56151 delux car #5 226890748623.0 18 165209 basic car #4 756030704398.0 8 173254 delux car #2 95610786143.0 10 309874 delux car #6 587716839953.0 18 529847 delux car #4 677741438376.0 18 555988 special car #1 894716423380.0 6 582646 basic car #2 299645942550.0 14 618993 basic car #6 255072922341.0 18 646018 delux car #3 800356544694.0 22 703883 delux car #1 564293728345.0 8 740363 basic car #3 454711139547.0 12 745818 basic car #5 352031103297.0 18 862082 basic car #1 581362817884.0 14
C:\Users\guesswho\AppData\Local\Temp\ipykernel_21936\421086600.py:26: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. df_add = df_add.append(df[key], ignore_index=True) C:\Users\guesswho\AppData\Local\Temp\ipykernel_21936\421086600.py:26: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. df_add = df_add.append(df[key], ignore_index=True) C:\Users\guesswho\AppData\Local\Temp\ipykernel_21936\421086600.py:26: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. df_add = df_add.append(df[key], ignore_index=True)
In [2]:
# this sections reads all the excel files in the directory and combine them
# get current working directory, then change the current working directory to c:\MLcourse\HJ project\experiement
# import all .xlsx files into df_new and group_by Item #
import pandas as pd
import numpy as np
import os
print(os.getcwd())
dirname = 'c:\\MLcourse\\HJ project\\experiement'
output_file = "combined.xlsx"
os.chdir(dirname)
print(os.getcwd())
ext = ('.xlsx')
df_new = pd.DataFrame()
for files in os.listdir(dirname):
if files.endswith(ext):
print("file name - ", files) # printing file name of desired extension
df_new = df_new.append(pd.read_excel(files, header=1, usecols="B:E"), ignore_index=True)
else:
continue
print(df_new.dtypes)
df_new.dropna(how='all',inplace=True)
df_new = df_new.astype({'Item #':'int64', 'Item description':'str', 'UPC code':'str', 'Quantity':'int64'})
print("converted df_new", '\n', df_new)
print(df_new.dtypes)
df_new_group = df_new.groupby(['Item #','Item description','UPC code']).sum()
print(df_new_group)
C:\MLCourse\HJ project c:\MLcourse\HJ project\experiement file name - single-1.xlsx file name - single-2.xlsx file name - single-3.xlsx Item # float64 Item description object UPC code float64 Quantity float64 dtype: object converted df_new Item # Item description UPC code Quantity 0 862082 basic car #1 581362817884.0 4 1 582646 basic car #2 299645942550.0 2 2 740363 basic car #3 454711139547.0 3 3 165209 basic car #4 756030704398.0 1 4 745818 basic car #5 352031103297.0 3 5 618993 basic car #6 255072922341.0 5 7 703883 delux car #1 564293728345.0 3 8 173254 delux car #2 95610786143.0 2 9 646018 delux car #3 800356544694.0 6 10 529847 delux car #4 677741438376.0 3 11 56151 delux car #5 226890748623.0 2 12 309874 delux car #6 587716839953.0 1 13 862082 basic car #1 581362817884.0 3 14 582646 basic car #2 299645942550.0 5 15 740363 basic car #3 454711139547.0 3 16 165209 basic car #4 756030704398.0 3 17 745818 basic car #5 352031103297.0 6 18 618993 basic car #6 255072922341.0 4 20 703883 delux car #1 564293728345.0 1 21 173254 delux car #2 95610786143.0 3 22 646018 delux car #3 800356544694.0 5 23 529847 delux car #4 677741438376.0 6 24 56151 delux car #5 226890748623.0 7 25 309874 delux car #6 587716839953.0 8 27 555988 special car #1 894716423380.0 3 28 56151 delux car #5 226890748623.0 9 29 165209 basic car #4 756030704398.0 4 30 173254 delux car #2 95610786143.0 5 31 309874 delux car #6 587716839953.0 9 32 529847 delux car #4 677741438376.0 9 33 555988 special car #1 894716423380.0 3 34 582646 basic car #2 299645942550.0 7 35 618993 basic car #6 255072922341.0 9 36 646018 delux car #3 800356544694.0 11 37 703883 delux car #1 564293728345.0 4 38 740363 basic car #3 454711139547.0 6 39 745818 basic car #5 352031103297.0 9 40 862082 basic car #1 581362817884.0 7 Item # int64 Item description object UPC code object Quantity int64 dtype: object Quantity Item # Item description UPC code 56151 delux car #5 226890748623.0 18 165209 basic car #4 756030704398.0 8 173254 delux car #2 95610786143.0 10 309874 delux car #6 587716839953.0 18 529847 delux car #4 677741438376.0 18 555988 special car #1 894716423380.0 6 582646 basic car #2 299645942550.0 14 618993 basic car #6 255072922341.0 18 646018 delux car #3 800356544694.0 22 703883 delux car #1 564293728345.0 8 740363 basic car #3 454711139547.0 12 745818 basic car #5 352031103297.0 18 862082 basic car #1 581362817884.0 14
C:\Users\guesswho\AppData\Local\Temp\ipykernel_21936\2300699762.py:21: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. df_new = df_new.append(pd.read_excel(files, header=1, usecols="B:E"), ignore_index=True) C:\Users\guesswho\AppData\Local\Temp\ipykernel_21936\2300699762.py:21: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. df_new = df_new.append(pd.read_excel(files, header=1, usecols="B:E"), ignore_index=True) C:\Users\guesswho\AppData\Local\Temp\ipykernel_21936\2300699762.py:21: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. df_new = df_new.append(pd.read_excel(files, header=1, usecols="B:E"), ignore_index=True)
In [3]:
df_new
Out[3]:
Item # | Item description | UPC code | Quantity | |
---|---|---|---|---|
0 | 862082 | basic car #1 | 581362817884.0 | 4 |
1 | 582646 | basic car #2 | 299645942550.0 | 2 |
2 | 740363 | basic car #3 | 454711139547.0 | 3 |
3 | 165209 | basic car #4 | 756030704398.0 | 1 |
4 | 745818 | basic car #5 | 352031103297.0 | 3 |
5 | 618993 | basic car #6 | 255072922341.0 | 5 |
7 | 703883 | delux car #1 | 564293728345.0 | 3 |
8 | 173254 | delux car #2 | 95610786143.0 | 2 |
9 | 646018 | delux car #3 | 800356544694.0 | 6 |
10 | 529847 | delux car #4 | 677741438376.0 | 3 |
11 | 56151 | delux car #5 | 226890748623.0 | 2 |
12 | 309874 | delux car #6 | 587716839953.0 | 1 |
13 | 862082 | basic car #1 | 581362817884.0 | 3 |
14 | 582646 | basic car #2 | 299645942550.0 | 5 |
15 | 740363 | basic car #3 | 454711139547.0 | 3 |
16 | 165209 | basic car #4 | 756030704398.0 | 3 |
17 | 745818 | basic car #5 | 352031103297.0 | 6 |
18 | 618993 | basic car #6 | 255072922341.0 | 4 |
20 | 703883 | delux car #1 | 564293728345.0 | 1 |
21 | 173254 | delux car #2 | 95610786143.0 | 3 |
22 | 646018 | delux car #3 | 800356544694.0 | 5 |
23 | 529847 | delux car #4 | 677741438376.0 | 6 |
24 | 56151 | delux car #5 | 226890748623.0 | 7 |
25 | 309874 | delux car #6 | 587716839953.0 | 8 |
27 | 555988 | special car #1 | 894716423380.0 | 3 |
28 | 56151 | delux car #5 | 226890748623.0 | 9 |
29 | 165209 | basic car #4 | 756030704398.0 | 4 |
30 | 173254 | delux car #2 | 95610786143.0 | 5 |
31 | 309874 | delux car #6 | 587716839953.0 | 9 |
32 | 529847 | delux car #4 | 677741438376.0 | 9 |
33 | 555988 | special car #1 | 894716423380.0 | 3 |
34 | 582646 | basic car #2 | 299645942550.0 | 7 |
35 | 618993 | basic car #6 | 255072922341.0 | 9 |
36 | 646018 | delux car #3 | 800356544694.0 | 11 |
37 | 703883 | delux car #1 | 564293728345.0 | 4 |
38 | 740363 | basic car #3 | 454711139547.0 | 6 |
39 | 745818 | basic car #5 | 352031103297.0 | 9 |
40 | 862082 | basic car #1 | 581362817884.0 | 7 |
In [4]:
df_new.to_excel(output_file, index=False)
No comments:
Post a Comment