In [1]:
# this section of codes read 3 sheets in an excel file from the same directory
# the two sheets are combined together and same item is added together
# still need to figure out why dtype in read_excel doesn't work properly without any error message
# - dtype didn't read in properly because there are empty rows in the cells (Null / NaN)
# note that when there isn't any Null value, the cells / data types are consistent across multiple sheets
# 11/9/2022
import os
print(os.getcwd())
dirname = 'c:\\MLcourse\\HJ project'
os.chdir(dirname)
print(os.getcwd())
import pandas as pd
import numpy as np
file_name = "Multi_sheet.xlsx"
df = pd.read_excel(file_name, sheet_name=[0,1,2], header=1, usecols="B:E")
# if sheet_name=None, all sheets will be read in as part of the dictionary
# all sheets are stored in the dictionary and can be assessed using key / df[key]
df_add = pd.DataFrame()
print('# of items in df ->', len(df))
for key in df:
print(key, "\n", df[key])
df_add = df_add.append(df[key], ignore_index=True)
# drop Null rows and convert the data types
df_add.dropna(how='all',inplace=True)
df_add = df_add.astype({'Item #':'int64', 'Item description':'str', 'UPC code':'str', 'Quantity':'int64'})
# group items together and add up all the same item # quantity
df_group = df_add.groupby(['Item #','Item description','UPC code']).sum()
print(df_group)
C:\MLCourse\HJ project
C:\MLCourse\HJ project
# of items in df -> 3
0
Item # Item description UPC code Quantity
0 862082.0 basic car #1 5.813628e+11 4.0
1 582646.0 basic car #2 2.996459e+11 2.0
2 740363.0 basic car #3 4.547111e+11 3.0
3 165209.0 basic car #4 7.560307e+11 1.0
4 745818.0 basic car #5 3.520311e+11 3.0
5 618993.0 basic car #6 2.550729e+11 5.0
6 NaN NaN NaN NaN
7 703883.0 delux car #1 5.642937e+11 3.0
8 173254.0 delux car #2 9.561079e+10 2.0
9 646018.0 delux car #3 8.003565e+11 6.0
10 529847.0 delux car #4 6.777414e+11 3.0
11 56151.0 delux car #5 2.268907e+11 2.0
12 309874.0 delux car #6 5.877168e+11 1.0
1
Item # Item description UPC code Quantity
0 862082.0 basic car #1 5.813628e+11 3.0
1 582646.0 basic car #2 2.996459e+11 5.0
2 740363.0 basic car #3 4.547111e+11 3.0
3 165209.0 basic car #4 7.560307e+11 3.0
4 745818.0 basic car #5 3.520311e+11 6.0
5 618993.0 basic car #6 2.550729e+11 4.0
6 NaN NaN NaN NaN
7 703883.0 delux car #1 5.642937e+11 1.0
8 173254.0 delux car #2 9.561079e+10 3.0
9 646018.0 delux car #3 8.003565e+11 5.0
10 529847.0 delux car #4 6.777414e+11 6.0
11 56151.0 delux car #5 2.268907e+11 7.0
12 309874.0 delux car #6 5.877168e+11 8.0
13 NaN NaN NaN NaN
14 555988.0 special car #1 8.947164e+11 3.0
2
Item # Item description UPC code Quantity
0 56151 delux car #5 226890748623 9
1 165209 basic car #4 756030704398 4
2 173254 delux car #2 95610786143 5
3 309874 delux car #6 587716839953 9
4 529847 delux car #4 677741438376 9
5 555988 special car #1 894716423380 3
6 582646 basic car #2 299645942550 7
7 618993 basic car #6 255072922341 9
8 646018 delux car #3 800356544694 11
9 703883 delux car #1 564293728345 4
10 740363 basic car #3 454711139547 6
11 745818 basic car #5 352031103297 9
12 862082 basic car #1 581362817884 7
Quantity
Item # Item description UPC code
56151 delux car #5 226890748623.0 18
165209 basic car #4 756030704398.0 8
173254 delux car #2 95610786143.0 10
309874 delux car #6 587716839953.0 18
529847 delux car #4 677741438376.0 18
555988 special car #1 894716423380.0 6
582646 basic car #2 299645942550.0 14
618993 basic car #6 255072922341.0 18
646018 delux car #3 800356544694.0 22
703883 delux car #1 564293728345.0 8
740363 basic car #3 454711139547.0 12
745818 basic car #5 352031103297.0 18
862082 basic car #1 581362817884.0 14
C:\Users\guesswho\AppData\Local\Temp\ipykernel_21936\421086600.py:26: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. df_add = df_add.append(df[key], ignore_index=True) C:\Users\guesswho\AppData\Local\Temp\ipykernel_21936\421086600.py:26: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. df_add = df_add.append(df[key], ignore_index=True) C:\Users\guesswho\AppData\Local\Temp\ipykernel_21936\421086600.py:26: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. df_add = df_add.append(df[key], ignore_index=True)
In [2]:
# this sections reads all the excel files in the directory and combine them
# get current working directory, then change the current working directory to c:\MLcourse\HJ project\experiement
# import all .xlsx files into df_new and group_by Item #
import pandas as pd
import numpy as np
import os
print(os.getcwd())
dirname = 'c:\\MLcourse\\HJ project\\experiement'
output_file = "combined.xlsx"
os.chdir(dirname)
print(os.getcwd())
ext = ('.xlsx')
df_new = pd.DataFrame()
for files in os.listdir(dirname):
if files.endswith(ext):
print("file name - ", files) # printing file name of desired extension
df_new = df_new.append(pd.read_excel(files, header=1, usecols="B:E"), ignore_index=True)
else:
continue
print(df_new.dtypes)
df_new.dropna(how='all',inplace=True)
df_new = df_new.astype({'Item #':'int64', 'Item description':'str', 'UPC code':'str', 'Quantity':'int64'})
print("converted df_new", '\n', df_new)
print(df_new.dtypes)
df_new_group = df_new.groupby(['Item #','Item description','UPC code']).sum()
print(df_new_group)
C:\MLCourse\HJ project
c:\MLcourse\HJ project\experiement
file name - single-1.xlsx
file name - single-2.xlsx
file name - single-3.xlsx
Item # float64
Item description object
UPC code float64
Quantity float64
dtype: object
converted df_new
Item # Item description UPC code Quantity
0 862082 basic car #1 581362817884.0 4
1 582646 basic car #2 299645942550.0 2
2 740363 basic car #3 454711139547.0 3
3 165209 basic car #4 756030704398.0 1
4 745818 basic car #5 352031103297.0 3
5 618993 basic car #6 255072922341.0 5
7 703883 delux car #1 564293728345.0 3
8 173254 delux car #2 95610786143.0 2
9 646018 delux car #3 800356544694.0 6
10 529847 delux car #4 677741438376.0 3
11 56151 delux car #5 226890748623.0 2
12 309874 delux car #6 587716839953.0 1
13 862082 basic car #1 581362817884.0 3
14 582646 basic car #2 299645942550.0 5
15 740363 basic car #3 454711139547.0 3
16 165209 basic car #4 756030704398.0 3
17 745818 basic car #5 352031103297.0 6
18 618993 basic car #6 255072922341.0 4
20 703883 delux car #1 564293728345.0 1
21 173254 delux car #2 95610786143.0 3
22 646018 delux car #3 800356544694.0 5
23 529847 delux car #4 677741438376.0 6
24 56151 delux car #5 226890748623.0 7
25 309874 delux car #6 587716839953.0 8
27 555988 special car #1 894716423380.0 3
28 56151 delux car #5 226890748623.0 9
29 165209 basic car #4 756030704398.0 4
30 173254 delux car #2 95610786143.0 5
31 309874 delux car #6 587716839953.0 9
32 529847 delux car #4 677741438376.0 9
33 555988 special car #1 894716423380.0 3
34 582646 basic car #2 299645942550.0 7
35 618993 basic car #6 255072922341.0 9
36 646018 delux car #3 800356544694.0 11
37 703883 delux car #1 564293728345.0 4
38 740363 basic car #3 454711139547.0 6
39 745818 basic car #5 352031103297.0 9
40 862082 basic car #1 581362817884.0 7
Item # int64
Item description object
UPC code object
Quantity int64
dtype: object
Quantity
Item # Item description UPC code
56151 delux car #5 226890748623.0 18
165209 basic car #4 756030704398.0 8
173254 delux car #2 95610786143.0 10
309874 delux car #6 587716839953.0 18
529847 delux car #4 677741438376.0 18
555988 special car #1 894716423380.0 6
582646 basic car #2 299645942550.0 14
618993 basic car #6 255072922341.0 18
646018 delux car #3 800356544694.0 22
703883 delux car #1 564293728345.0 8
740363 basic car #3 454711139547.0 12
745818 basic car #5 352031103297.0 18
862082 basic car #1 581362817884.0 14
C:\Users\guesswho\AppData\Local\Temp\ipykernel_21936\2300699762.py:21: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. df_new = df_new.append(pd.read_excel(files, header=1, usecols="B:E"), ignore_index=True) C:\Users\guesswho\AppData\Local\Temp\ipykernel_21936\2300699762.py:21: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. df_new = df_new.append(pd.read_excel(files, header=1, usecols="B:E"), ignore_index=True) C:\Users\guesswho\AppData\Local\Temp\ipykernel_21936\2300699762.py:21: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. df_new = df_new.append(pd.read_excel(files, header=1, usecols="B:E"), ignore_index=True)
In [3]:
df_new
Out[3]:
| Item # | Item description | UPC code | Quantity | |
|---|---|---|---|---|
| 0 | 862082 | basic car #1 | 581362817884.0 | 4 |
| 1 | 582646 | basic car #2 | 299645942550.0 | 2 |
| 2 | 740363 | basic car #3 | 454711139547.0 | 3 |
| 3 | 165209 | basic car #4 | 756030704398.0 | 1 |
| 4 | 745818 | basic car #5 | 352031103297.0 | 3 |
| 5 | 618993 | basic car #6 | 255072922341.0 | 5 |
| 7 | 703883 | delux car #1 | 564293728345.0 | 3 |
| 8 | 173254 | delux car #2 | 95610786143.0 | 2 |
| 9 | 646018 | delux car #3 | 800356544694.0 | 6 |
| 10 | 529847 | delux car #4 | 677741438376.0 | 3 |
| 11 | 56151 | delux car #5 | 226890748623.0 | 2 |
| 12 | 309874 | delux car #6 | 587716839953.0 | 1 |
| 13 | 862082 | basic car #1 | 581362817884.0 | 3 |
| 14 | 582646 | basic car #2 | 299645942550.0 | 5 |
| 15 | 740363 | basic car #3 | 454711139547.0 | 3 |
| 16 | 165209 | basic car #4 | 756030704398.0 | 3 |
| 17 | 745818 | basic car #5 | 352031103297.0 | 6 |
| 18 | 618993 | basic car #6 | 255072922341.0 | 4 |
| 20 | 703883 | delux car #1 | 564293728345.0 | 1 |
| 21 | 173254 | delux car #2 | 95610786143.0 | 3 |
| 22 | 646018 | delux car #3 | 800356544694.0 | 5 |
| 23 | 529847 | delux car #4 | 677741438376.0 | 6 |
| 24 | 56151 | delux car #5 | 226890748623.0 | 7 |
| 25 | 309874 | delux car #6 | 587716839953.0 | 8 |
| 27 | 555988 | special car #1 | 894716423380.0 | 3 |
| 28 | 56151 | delux car #5 | 226890748623.0 | 9 |
| 29 | 165209 | basic car #4 | 756030704398.0 | 4 |
| 30 | 173254 | delux car #2 | 95610786143.0 | 5 |
| 31 | 309874 | delux car #6 | 587716839953.0 | 9 |
| 32 | 529847 | delux car #4 | 677741438376.0 | 9 |
| 33 | 555988 | special car #1 | 894716423380.0 | 3 |
| 34 | 582646 | basic car #2 | 299645942550.0 | 7 |
| 35 | 618993 | basic car #6 | 255072922341.0 | 9 |
| 36 | 646018 | delux car #3 | 800356544694.0 | 11 |
| 37 | 703883 | delux car #1 | 564293728345.0 | 4 |
| 38 | 740363 | basic car #3 | 454711139547.0 | 6 |
| 39 | 745818 | basic car #5 | 352031103297.0 | 9 |
| 40 | 862082 | basic car #1 | 581362817884.0 | 7 |
In [4]:
df_new.to_excel(output_file, index=False)
No comments:
Post a Comment