"""Extract table rows from the PDF files in a folder using Apache Tika.

Two table layouts are supported: one with the columns
(no, id, size, weight_a, weight_b) and one that carries an extra integer
"Quantity" column between the size and the two weights.  The quantity is
skipped, so both layouts yield the same 5-field rows.
"""
import os
import re
from typing import List, Optional, Tuple

folder = ".../test/input"

# The non-capturing group `(?:\d+\s+)?` swallows the optional Quantity column.
# It can only match a bare integer followed by whitespace, so a decimal value
# such as "2.640" is never mistaken for a quantity.
# NOTE(review): this assumes quantities are integers and the two trailing
# numeric columns contain a decimal point, as in the sample data -- confirm.
pattern = (
    r'^(\d+)\s+(\S+)\s+([0-9.]+ x [0-9]+ x [A-Z]+)\s+'
    r'(?:\d+\s+)?'
    r'([0-9.]+)\s+([0-9.]+)'
)
ROW_RE = re.compile(pattern, re.MULTILINE)

data = []  # List to store extracted data


def extract_rows(text: Optional[str]) -> List[Tuple[str, ...]]:
    """Return every table row found in *text* as a 5-tuple of strings.

    Rows that contain the extra Quantity column are normalised to the same
    five fields as rows without it.  ``None`` or empty text yields ``[]``.
    """
    if not text:
        return []
    return ROW_RE.findall(text)


def main() -> None:
    """Parse every file in ``folder``, collect the rows in ``data``, print them."""
    # Imported lazily so extract_rows() can be used without Tika installed.
    from tika import parser

    for filename in os.listdir(folder):
        file_path = os.path.join(folder, filename)
        if not os.path.isfile(file_path):
            continue  # sub-directories cannot be parsed as documents

        parsed_pdf = parser.from_file(file_path)
        # The 'content' key may be present but hold None when nothing could be
        # extracted, so test the value rather than the key.
        text = parsed_pdf.get('content')
        if text:
            data.extend(extract_rows(text))
        else:
            print(f"Text extraction failed for file: {file_path}")

    # Print the extracted data
    for row in data:
        print(row)


if __name__ == "__main__":
    main()

# Question: I want to extract some data from a table in my PDF files, but some
# tables have an extra 'Quantity' column. How do I handle both cases?
First type of data (rows without a Quantity column):
('57', '231228B23', '0.21 x 914 x C', '2.640', '2.680')
('58', '231228B24', '0.21 x 914 x C', '2.682', '2.722')
('59', '231228B25', '0.21 x 914 x C', '2.710', '2.750')
('60', '231228B26', '0.21 x 914 x C', '2.714', '2.754')
('61', '231228B27', '0.21 x 914 x C', '2.636', '2.676')
('62', '231228B28', '0.21 x 914 x C', '2.628', '2.668')
('63', '231228B29', '0.21 x 914 x C', '2.628', '2.668')
('64', '231228A37', '0.21 x 914 x C', '2.684', '2.724')
('65', '231228A38', '0.21 x 914 x C', '2.718', '2.758')
('66', '231228A39', '0.21 x 914 x C', '2.646', '2.686')
('67', '231228A40', '0.21 x 914 x C', '2.652', '2.692')

Second type of data (rows with an extra Quantity column as the fourth field):
('7', '231228B25', '0.21 x 914 x C', '1', '2.710', '2.750')
('8', '231228B26', '0.21 x 914 x C', '1', '2.714', '2.754')
('9', '231228B27', '0.21 x 914 x C', '1', '2.636', '2.676')
('10', '231228B28', '0.21 x 914 x C', '1', '2.628', '2.668')
('11', '231228B29', '0.21 x 914 x C', '1', '2.628', '2.668')
('12', '231228A37', '0.21 x 914 x C', '1', '2.684', '2.724')

I do not need the Quantity column.