Spaces:
Runtime error
Runtime error
| import os | |
| import cv2 | |
| import math | |
| import numpy as np | |
| import pandas as pd | |
| from pdf2image import convert_from_bytes | |
| import streamlit as st | |
def get_img(uploaded_file):
    """Decode the contents of an uploaded file into an OpenCV image.

    Args:
        uploaded_file: Object exposing ``read()`` that returns the encoded
            image bytes (presumably a Streamlit upload -- confirm with caller).

    Returns:
        Whatever ``cv2.imdecode`` produces for the buffer when asked for a
        colour image (flag value 1).
    """
    raw = uploaded_file.read()
    # imdecode wants a flat uint8 array rather than a bytes object.
    buffer = np.asarray(bytearray(raw), dtype=np.uint8)
    return cv2.imdecode(buffer, cv2.IMREAD_COLOR)
def convert_pdf_to_image(filename):
    """Render a PDF into a list of page images at 500 DPI.

    Args:
        filename: Passed unchanged to ``pdf2image.convert_from_bytes``; despite
            the name it is therefore expected to be the raw PDF bytes, not a
            path (NOTE(review): confirm against callers).

    Returns:
        The list of page images returned by ``convert_from_bytes``.
    """
    return convert_from_bytes(filename, dpi=500)
def filter_color(img):
    """Build a single-channel mask that blanks out dark pixels.

    The image is converted from BGR to HSV and thresholded to the range
    H 0-179, S 0-100, V 0-130 (dark / near-black tones). The resulting mask
    is then inverted, so pixels inside that range come back as 0 and all
    other pixels as 255.

    Args:
        img: BGR image as accepted by ``cv2.cvtColor``.

    Returns:
        The inverted threshold mask.
    """
    hsv_img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)

    # HSV bounds treated as "black".
    dark_lower = np.array([0, 0, 0])
    dark_upper = np.array([179, 100, 130])

    dark_mask = cv2.inRange(hsv_img, dark_lower, dark_upper)
    # Invert the mask: dark pixels -> 0, everything else -> 255.
    return cv2.bitwise_not(dark_mask)
def plot(img, boxes):
    """Draw boxes and their scores on a copy of ``img``.

    Font size and line thickness are scaled from the shorter image side so
    annotations stay legible across resolutions.

    Args:
        img: Three-dimensional image array (height, width, channels).
        boxes: Iterable of sequences where items 0-3 are the corner
            coordinates (x1, y1, x2, y2) and item 4 is a numeric value that is
            rounded to two decimals and printed just below the top-left corner.

    Returns:
        An annotated copy of ``img``; the input itself is not drawn on.
    """
    height, width, _ = img.shape
    shorter_side = min(width, height)

    font_scale = shorter_side * 1e-3
    thickness = math.ceil(shorter_side * 1e-3)
    # Vertical shift that moves the label from the box edge into the box.
    label_drop = int(height * 2.5e-2)
    colour = (0, 0, 255)  # red in OpenCV's BGR channel order

    canvas = img.copy()
    for box in boxes:
        x1, y1 = int(box[0]), int(box[1])
        x2, y2 = int(box[2]), int(box[3])
        canvas = cv2.rectangle(canvas, (x1, y1), (x2, y2), colour, thickness)

        label = str(round(float(box[4]), 2))
        cv2.putText(
            canvas,
            label,
            (x1, y1 + label_drop),
            cv2.FONT_HERSHEY_SIMPLEX,
            font_scale,
            colour,
            thickness,
        )
    return canvas
def delete_file(filename):
    """Remove ``filename`` if it is present; do nothing when it is missing.

    The removal is attempted directly instead of checking for existence
    first, so a file that disappears between the check and the removal can no
    longer raise.

    Args:
        filename: Path of the file to delete.

    Raises:
        OSError: For failures other than the path being absent (for example
            the path is a directory or permissions are insufficient).
    """
    try:
        os.remove(filename)
    except (FileNotFoundError, NotADirectoryError):
        # Already gone (or a parent component is not a directory): the
        # desired end state is reached, so there is nothing to do.
        pass
def save_excel_file(
    idx, df: pd.DataFrame, foldername, filename, page_enumeration: int = 0
):
    """Write one table to ``<foldername>/<filename>page<P>table<idx>.csv``.

    Despite the function name, the output is a CSV file produced with
    ``DataFrame.to_csv``; the DataFrame index is not written.

    Args:
        idx: Table number used in the output file name.
        df: Table contents to write.
        foldername: Directory that receives the file.
        filename: Prefix of the output file name.
        page_enumeration: Page number used in the output file name.
    """
    target = f"{foldername}/{filename}page{page_enumeration}table{idx}.csv"
    df.to_csv(target, index=False)
def _page_table_key(name):
    """Return a sort key ordering table CSVs by numeric page, then table index.

    Names are expected to look like ``<prefix>page<P>table<T>.csv``. Names
    that do not parse are placed after all well-formed ones, ordered by name.
    """
    # Split on the LAST "page" so a prefix that itself contains the word
    # "page" cannot confuse the parse.
    _, _, tail = name.rpartition("page")
    page_text, _, rest = tail.partition("table")
    table_text = rest.split(".")[0]
    try:
        return (0, int(page_text), int(table_text), name)
    except ValueError:
        return (1, 0, 0, name)


def concat_csv(folder, filename: str):
    """Merge the per-table CSV files in a folder into one CSV and display it.

    Files are processed in numeric page order, then numeric table order, so
    page 10 follows page 9 rather than page 1. The first row of the first
    non-empty table becomes the column header of the merged result; the first
    row of every table is dropped from the data. The merged frame is shown
    with ``st.dataframe`` and written to ``filename`` without its index.

    Args:
        folder: Object whose ``name`` attribute is the directory holding the
            table CSVs (presumably a ``tempfile.TemporaryDirectory`` --
            confirm with caller).
        filename: Path of the merged CSV to write.

    Raises:
        ValueError: If the merged data ends up with a different number of
            columns than the header row.
    """
    foldername = folder.name
    files = sorted(os.listdir(foldername), key=_page_table_key)

    header = None
    bodies = []
    for file in files:
        try:
            table = pd.read_csv(f"{foldername}/{file}")
        except pd.errors.EmptyDataError:
            # A file with no columns at all carries no table; skip it.
            continue
        if table.empty:
            # Nothing to use as a header row and nothing to append.
            continue
        if header is None:
            header = table.iloc[0]
        bodies.append(table[1:])

    # Concatenate once at the end instead of re-copying the frame per file.
    df = pd.concat(bodies) if bodies else pd.DataFrame()
    if not df.empty:
        df.columns = header
    st.dataframe(df)
    df.to_csv(filename, index=False)