Spaces:

binery
/

Bank_Statement_Parser

Runtime error

App Files Files Community

Bank_Statement_Parser / file_utils.py

binery

Upload 8 files

16aad69 over 2 years ago

raw

history blame contribute delete

2.63 kB

	import os
	import cv2
	import math
	import numpy as np
	import pandas as pd
	from pdf2image import convert_from_bytes

	import streamlit as st


	def get_img(uploaded_file):
	    """Decode an uploaded, still-encoded image into an OpenCV image.

	    ``uploaded_file`` only needs a ``read()`` method returning the encoded
	    image bytes. The bytes are viewed as a ``uint8`` buffer and handed to
	    ``cv2.imdecode`` with the colour flag; whatever ``cv2.imdecode``
	    produces for that buffer is returned unchanged.
	    """
	    raw_bytes = uploaded_file.read()
	    encoded = np.frombuffer(raw_bytes, dtype=np.uint8)
	    # cv2.IMREAD_COLOR is the named form of the flag value 1.
	    return cv2.imdecode(encoded, cv2.IMREAD_COLOR)


	def convert_pdf_to_image(filename, dpi=500):
	    """Render the pages of an in-memory PDF to images.

	    Args:
	        filename: Despite the name, this value is passed straight to
	            ``convert_from_bytes``, so it must be the PDF content as bytes
	            rather than a path on disk.
	        dpi: Rendering resolution forwarded to ``convert_from_bytes``.
	            Defaults to 500, the value that used to be hard-coded here;
	            a lower value reduces the size of the rendered pages.

	    Returns:
	        The list of page images returned by ``convert_from_bytes``.
	    """
	    return convert_from_bytes(filename, dpi=dpi)


	def filter_color(img):
	    """Return the inverse of a "dark pixel" mask computed in HSV space.

	    The image is converted from BGR to HSV, pixels whose HSV values fall
	    inside ``[0, 0, 0] .. [179, 100, 130]`` are selected with
	    ``cv2.inRange``, and that selection is then inverted with
	    ``cv2.bitwise_not`` before being returned.
	    """
	    # Bounds of the range treated as "black" (H, S, V order).
	    dark_lower = np.array([0, 0, 0])
	    dark_upper = np.array([179, 100, 130])

	    hsv_img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
	    dark_mask = cv2.inRange(hsv_img, dark_lower, dark_upper)

	    # Flip the mask: pixels inside the dark range end up cleared.
	    return cv2.bitwise_not(dark_mask)


	def plot(img, boxes):
	    """Return a copy of ``img`` with every box outlined and labelled.

	    Each entry of ``boxes`` is indexed positionally: items 0-1 are used as
	    the top-left corner, items 2-3 as the bottom-right corner and item 4 is
	    rendered as text rounded to two decimals, placed slightly below the
	    top-left corner. Line thickness and font size scale with the shorter
	    image side. Drawing happens on a copy, so ``img`` is left untouched.

	    ``img.shape`` is unpacked into three values, so a three-dimensional
	    array is required.
	    """
	    font_ratio = 1e-3
	    thickness_ratio = 1e-3
	    label_drop_ratio = 2.5e-2
	    colour = (0, 0, 255)  # red if the image is in BGR channel order

	    height, width, _ = img.shape
	    short_side = min(width, height)
	    font_scale = short_side * font_ratio
	    thickness = math.ceil(short_side * thickness_ratio)

	    canvas = img.copy()
	    for box in boxes:
	        x1, y1 = int(box[0]), int(box[1])
	        x2, y2 = int(box[2]), int(box[3])
	        canvas = cv2.rectangle(canvas, (x1, y1), (x2, y2), colour, thickness)

	        label = str(round(float(box[4]), 2))
	        label_origin = (x1, y1 + int(height * label_drop_ratio))
	        cv2.putText(
	            canvas,
	            label,
	            label_origin,
	            cv2.FONT_HERSHEY_SIMPLEX,
	            font_scale,
	            colour,
	            thickness,
	        )
	    return canvas


	def delete_file(filename):
	    """Remove ``filename``; silently do nothing if it is not there.

	    The removal is attempted directly instead of checking for existence
	    first, so a file that disappears between a check and the removal can
	    no longer raise.

	    Args:
	        filename: Path of the file to remove.

	    Raises:
	        OSError: For failures other than a missing path (for example the
	            path is a directory or permission is denied), as raised by
	            ``os.remove``.
	    """
	    try:
	        os.remove(filename)
	    except (FileNotFoundError, NotADirectoryError):
	        # Missing file, or a parent component that is not a directory:
	        # either way there is nothing to delete.
	        pass


	def save_excel_file(
	    idx, df: pd.DataFrame, foldername, filename, page_enumeration: int = 0
	):
	    """Write one table to ``<foldername>/<filename>page<P>table<idx>.csv``.

	    ``<P>`` is ``page_enumeration``. Despite the function name, the output
	    is a CSV file, written without the DataFrame index.
	    """
	    target_path = f"{foldername}/{filename}page{page_enumeration}table{idx}.csv"
	    df.to_csv(target_path, index=False)


	def _page_table_key(name):
	    """Parse ``(page, table)`` integers from ``...page<P>table<T>.csv``, else None."""
	    stem, dot, extension = name.rpartition(".")
	    if not dot or extension != "csv":
	        return None
	    head, table_sep, table_part = stem.rpartition("table")
	    _, page_sep, page_part = head.rpartition("page")
	    if not table_sep or not page_sep:
	        return None
	    try:
	        return int(page_part), int(table_part)
	    except ValueError:
	        return None


	def concat_csv(folder, filename: str):
	    """Merge the per-table CSV files found in ``folder`` into one CSV.

	    Files are expected to be named ``<stem>page<P>table<T>.csv``. They are
	    processed in numeric (page, table) order, so page 10 follows page 9
	    rather than page 1; directory entries that do not follow the naming
	    pattern are ignored. The first row of the first usable table supplies
	    the column names of the result, and the first row of every table is
	    dropped from the data. When any data rows remain, the merged frame is
	    shown with ``st.dataframe`` and written to ``filename`` without its
	    index; otherwise nothing is displayed or written.

	    Args:
	        folder: Object whose ``name`` attribute is the directory holding
	            the per-table CSV files.
	        filename: Destination path of the merged CSV.
	    """
	    foldername = folder.name

	    keyed_names = []
	    for name in os.listdir(foldername):
	        key = _page_table_key(name)
	        if key is not None:
	            keyed_names.append((key, name))
	    keyed_names.sort()

	    columns = None
	    frames = []
	    for _, name in keyed_names:
	        try:
	            table = pd.read_csv(f"{foldername}/{name}")
	        except (pd.errors.EmptyDataError, pd.errors.ParserError):
	            # Best effort: an unreadable table must not abort the merge.
	            continue
	        if table.empty:
	            continue
	        if columns is None:
	            # Take the header from the first table that actually has rows,
	            # not strictly from the first file in the listing.
	            columns = table.iloc[0]
	        frames.append(table[1:])

	    if not frames:
	        return
	    df = pd.concat(frames)
	    if df.empty:
	        return

	    # NOTE(review): this assignment assumes every table has the same number
	    # of columns as the header row - confirm against the table extractor.
	    df.columns = columns
	    st.dataframe(df)
	    df.to_csv(filename, index=False)