I am using Invoice2data
and pandas to scan PDF files and insert them I excel.
The Excel file is pretty big, so I have extracted the ones that are of interest. The problem I face is when I am searching for the cell where I need to place my values.
I have tried multiple methods form these suggestions :"https://stackoverflow.com/questions/18327624/find-elements-index-in-pandas-series", however while I can find the index of the first 5 to 10 invoice nr. The same is not true when I search for ones above that. Can anyone help me?
Here is my code:
from invoice2data import extract_datafrom invoice2data.extract.loader import read_templatesimport osimport pandas as pddef processinvoices_in_folder(folder_path, start_row): # Først fil navnet, sheetnavnet også df = pd.read_excel('MODI.xlsx',header=None,sheet_name='Dokumentations_udtræk',usecols='P,AA:AM',skiprows=start_row) se = pd.Series(df[15]) p = 0 templates = read_templates('\Templates') fak_name = [] fak_indhold = [] for filename in os.listdir(folder_path): p += 1 splid = filename.split('-') if len(splid) == 6: fak_nr = splid[3] +'-'+ splid[4] else: fak_nr = splid[3] filtered_indexes = (se == int(fak_nr)).index if not filtered_indexes.empty: pos = filtered_indexes[0] else: print(f'Kunne ikke finde Fak_nr {fak_nr}\n Faktura nr: {p}') continue input_file = os.path.join(folder_path, filename) try: result = extract_data(input_file, templates=templates) del(result['beskrivelse'][0]) df.at[pos, 26] = filename.replace('.pdf', "") df.at[pos, 27] = result['amount'] df.at[pos, 28] = result['date'].date().strftime("%d-%m-20%y") df.at[pos, 29] = result['firma'] df.at[pos, 30] = 'M' df.at[pos, 31] = result['ordre_nr'] concat = ",".join(result['beskrivelse']) df.at[pos, 32] = concat df.at[pos, 33] = result['amount'] except Exception as e: result = f"\nError: {e}\n" fak_name.append(filename) fak_indhold.append(result) print(f'PDF nr. {p}, filnavn: {filename} \n{result}\n') data_output = df.loc[:, [26, 27, 28, 29, 30, 31, 32, 33]] data_output[27] = data_output[27].replace(to_replace=',', value='', regex=True) data_output[33] = data_output[33].replace(to_replace=',', value='', regex=True) start_column = 26 # Starting from column AA with pd.ExcelWriter('MODI.xlsx', mode='a', if_sheet_exists="overlay", date_format="DD-MM-YYYY") as writer: data_output.to_excel(writer, sheet_name='Dokumentations_udtræk', index=False, header=False, startrow=start_row, startcol=start_column) return fak_name, fak_indhold# folder path skal skrives som 'Faktura/Eksempelmappe - kopierede fakturaer' Start_row skal være tallet -1fak_name, fak_indhold = processinvoices_in_folder(folder_path = 'Faktura\_1omgang_17', start_row = 10104-1)
And this is the error i keep getting:
Traceback (most recent call last): File "C:\Users\modi\AppData\Local\anaconda3\Lib\site-packages\IPython\core\interactiveshell.py", line 3526, in run_code exec(code_obj, self.user_global_ns, self.user_ns) File "<ipython-input-70-1acc93cec142>", line 1, in <module> list(se).index(106545394ValueError: 106545394 is not in list