Allora, @marioma, come promesso ho fatto un piccolo esempio in tkinter che apre e legge un file PDF (utilizzando il Tuo modo, vedi il metodo di callback "_sel_file(self)"), permette di impostare il separatore, eliminare le righe non interessanti, elaborare una riga manualmente (funzione pensata per le intestazioni, che possono avere spazi in più) e preparare automaticamente blocchi di righe dati (funziona per righe), magari dopo aver corretto qualche imprecisione di lettura e, infine, generare un file CSV (senza però utilizzare la libreria csv).
Per esporre i dati utilizzo due textbox in cui è disattivato il ritorno a capo automatico; una serie di pulsanti, attivati secondo il contesto, permette le "manipolazioni" eventualmente necessarie.
Ho appena fatto il primo test e sembra funzionare; si "potrebbero" anche processare più PDF, eliminando l'intero contenuto di self.pdf_text, caricando un altro PDF e aggiungendone i dati, ma questo non l'ho sperimentato.
Spero Tu abbia guardato le classi, perché nel codice non vi è alcun aiuto: è già troppo grosso, anche se è il minimo sindacale, e ci vorrebbero molti accorgimenti in più. In ogni caso la documentazione può essere di grande aiuto per ciò che è oscuro, e "decifrare" è un ottimo esercizio.
Spero Ti sia utile.
# -*- coding: utf-8 -*-
import os
import tkinter as tk
import tkinter.messagebox as msgb
import tkinter.filedialog as fdlg
import pdftotext
class App(tk.Tk):
    """Main window of a small tool that turns PDF text into CSV rows.

    The window has three areas stacked vertically:

    * a toolbar (select PDF, close PDF, create CSV, quit);
    * ``pdf_text``: the text lines extracted from the PDF, with a side
      column holding the separator entry and the row-manipulation buttons;
    * ``csv_text``: the rows already prepared for the CSV file.

    ``pdf_text`` is left editable once a PDF has been loaded so that the
    user can fix extraction glitches by hand; ``csv_text`` is kept
    read-only and is only modified by the methods of this class.
    """

    def __init__(self):
        """Build the widgets and bring the interface to its initial state."""
        super().__init__()
        self.title('PDF => CSV')
        self._build_toolbar()
        self._build_pdf_panel()
        self._build_csv_panel()
        # Only the two text panels (rows 1 and 2) absorb extra window space.
        self.grid_columnconfigure(0, weight=1)
        self.grid_rowconfigure(1, weight=1)
        self.grid_rowconfigure(2, weight=1)
        # expanduser() also resolves a home directory where the HOME
        # environment variable is not set (os.getenv('HOME') gave None there).
        self.init_dir = os.path.expanduser('~')
        self.pdf_opened = False
        self._initialize()

    def _build_toolbar(self):
        """Create the top row with the four file-level buttons."""
        p_file = tk.Frame(self)
        p_file.grid(row=0, column=0, sticky='ew')
        self.bt_sel_file = tk.Button(p_file, text='Seleziona PDF',
                                     padx=5, pady=5,
                                     command=self._sel_file)
        self.bt_sel_file.grid(row=0, column=0, sticky='ew')
        self.bt_close_file = tk.Button(p_file, text='Chiudi PDF',
                                       padx=5, pady=5,
                                       command=self._close_file)
        self.bt_close_file.grid(row=0, column=1, sticky='ew')
        self.bt_make_csv = tk.Button(p_file, text='Crea CSV',
                                     padx=5, pady=5,
                                     command=self._make_csv)
        self.bt_make_csv.grid(row=0, column=2, sticky='ew')
        self.bt_close = tk.Button(p_file, text='Esci',
                                  padx=5, pady=5,
                                  command=self.destroy)
        self.bt_close.grid(row=0, column=3, sticky='ew')
        # Same uniform group: the four buttons always share the width equally.
        for column in range(4):
            p_file.grid_columnconfigure(column, uniform=1, weight=1)

    def _build_pdf_panel(self):
        """Create the PDF text area and its column of editing controls."""
        p_text = tk.Frame(self)
        p_text.grid(row=1, column=0, sticky='nsew')
        lbl = tk.Label(p_text, text='Righe testo estratte dal PDF',
                       justify='left', padx=5, pady=5, anchor='w')
        lbl.grid(row=0, column=0, columnspan=3, sticky='ew')
        # wrap='none': one PDF line is always exactly one widget line.
        self.pdf_text = tk.Text(p_text, height=15, bg='#D5E711',
                                wrap='none', padx=5, pady=5)
        self.pdf_text.grid(row=1, column=0, sticky='nsew')
        self._add_scrollbars(p_text, self.pdf_text)

        p_bt_file = tk.Frame(p_text)
        p_bt_file.grid(row=1, column=2, sticky='ns')
        lbl = tk.Label(p_bt_file, text='Car. separatore:', anchor='w',
                       padx=5, pady=5)
        lbl.grid(row=0, column=0)
        self.e_sep = tk.Entry(p_bt_file, width=3)
        self.e_sep.grid(row=1, column=0)
        self.bt_del_line = tk.Button(p_bt_file, text='Elimina righe',
                                     padx=5, pady=5,
                                     command=self._del_select_rows)
        self.bt_del_line.grid(row=3, column=0, sticky='nsew')
        self.bt_trasf_line = tk.Button(p_bt_file, text='Trasferisci riga',
                                       padx=5, pady=5,
                                       command=self._data_transfer)
        self.bt_trasf_line.grid(row=4, column=0, sticky='nsew')
        self.bt_elab_lines = tk.Button(p_bt_file, text='Elabora righe',
                                       padx=5, pady=5,
                                       command=self._elab_rows)
        self.bt_elab_lines.grid(row=5, column=0, sticky='nsew')
        # Row 2 holds no widget: it is a spacer as tall as one button row.
        for row in range(2, 6):
            p_bt_file.grid_rowconfigure(row, uniform=2, weight=1)

        p_text.grid_columnconfigure(0, weight=1)
        p_text.grid_rowconfigure(1, weight=1)

    def _build_csv_panel(self):
        """Create the text area that shows the rows prepared for the CSV."""
        p_csv = tk.Frame(self)
        p_csv.grid(row=2, column=0, sticky='nsew')
        lbl = tk.Label(p_csv, text='Righe elaborate per il CSV',
                       justify='left', padx=5, pady=5, anchor='w')
        lbl.grid(row=0, column=0, columnspan=2, sticky='ew')
        self.csv_text = tk.Text(p_csv, height=15, bg='#E7D011',
                                wrap='none', padx=5, pady=5)
        self.csv_text.grid(row=1, column=0, sticky='nsew')
        self._add_scrollbars(p_csv, self.csv_text)
        p_csv.grid_columnconfigure(0, weight=1)
        p_csv.grid_rowconfigure(1, weight=1)

    @staticmethod
    def _add_scrollbars(parent, text):
        """Attach a vertical and a horizontal scrollbar to *text*.

        *text* is expected to sit at row 1, column 0 of *parent*; the
        vertical bar goes to its right and the horizontal bar below it.
        """
        v_scr = tk.Scrollbar(parent, orient=tk.VERTICAL, command=text.yview)
        text.configure(yscrollcommand=v_scr.set)
        v_scr.grid(row=1, column=1, sticky='ns')
        h_scr = tk.Scrollbar(parent, orient=tk.HORIZONTAL, command=text.xview)
        text.configure(xscrollcommand=h_scr.set)
        h_scr.grid(row=2, column=0, sticky='ew')

    def _initialize(self):
        """Reset the interface: default separator, empty panes, idle buttons."""
        self.e_sep.delete(0, 'end')
        self.e_sep.insert(0, ';')
        self.bt_sel_file.configure(state='normal')
        self.bt_close.configure(state='normal')
        for button in (self.bt_close_file, self.bt_make_csv,
                       self.bt_del_line, self.bt_trasf_line,
                       self.bt_elab_lines):
            button.configure(state='disabled')
        for pane in (self.pdf_text, self.csv_text):
            # A disabled Text widget silently ignores delete(), so it has
            # to be enabled first or stale rows would survive the reset.
            pane.configure(state='normal')
            pane.delete('1.0', 'end')
            pane.configure(state='disabled')
        self.pdf_opened = False

    def _sel_file(self):
        """Ask for a PDF file and append the text lines of all its pages."""
        f_types = [('File pdf', '*.pdf')]
        try:
            f = fdlg.askopenfile(parent=self,
                                 initialdir=self.init_dir,
                                 title='Selezione PDF',
                                 mode='rb',
                                 filetypes=f_types)
            if not f:
                # Dialog cancelled by the user.
                return
            # "with" closes the file even when the PDF cannot be parsed.
            with f:
                self.init_dir = os.path.dirname(f.name)
                pdf = pdftotext.PDF(f)
            # Collect every page, not just the first one; a PDF without
            # pages simply yields no lines instead of raising IndexError.
            lines = []
            for page in pdf:
                lines.extend(page.splitlines())
        except (OSError, pdftotext.Error):
            msgb.showerror(title='Apertura fallita',
                           message='Errore lettura file')
            return
        if lines:
            self._set_data(lines)
        else:
            msgb.showwarning(title='PDF senza testo',
                             message='Nessuna riga di testo estratta dal PDF')

    def _set_data(self, data=None):
        """Append the strings in *data* to the PDF pane, one per line.

        The pane is left in the ``normal`` state so that the user can edit
        the extracted text. Nothing happens when *data* is empty or None.
        """
        if not data:
            return
        self.pdf_text.configure(state='normal')
        self.pdf_text.insert('end', ''.join(row + '\n' for row in data))
        self.pdf_opened = True
        self._evaluate_state()

    def _evaluate_state(self):
        """Enable or disable the buttons according to the panes' content."""
        # 'end-1c' leaves out the newline Tk always keeps at the very end.
        has_pdf_text = bool(self.pdf_text.get('1.0', 'end-1c'))
        pdf_state = 'normal' if has_pdf_text else 'disabled'
        for button in (self.bt_close_file, self.bt_del_line,
                       self.bt_trasf_line, self.bt_elab_lines):
            button.configure(state=pdf_state)
        has_csv_text = bool(self.csv_text.get('1.0', 'end-1c'))
        self.bt_make_csv.configure(
            state='normal' if has_csv_text else 'disabled')

    @staticmethod
    def _rows_from_indices(init_index, end_index):
        """Return the line numbers spanned by two Tk ``line.column`` indices.

        A range that ends at column 0 of a later line contains no character
        of that line (this is what a full-line selection looks like), so
        that trailing line is left out of the result.
        """
        first = int(init_index.split('.')[0])
        last_line, last_column = end_index.split('.')
        last = int(last_line)
        if last > first and int(last_column) == 0:
            last -= 1
        return list(range(first, last + 1))

    @staticmethod
    def _join_fields(text, sep):
        """Split *text* on runs of whitespace and join the words with *sep*."""
        return sep.join(text.split())

    def _get_rows_intervall(self):
        """Return the PDF-pane line numbers to act upon, in ascending order.

        These are the lines touched by the selection or, when nothing is
        selected, the single line holding the insertion cursor.
        """
        if self.pdf_text.tag_ranges(tk.SEL):
            init_index = self.pdf_text.index(tk.SEL_FIRST)
            end_index = self.pdf_text.index(tk.SEL_LAST)
        else:
            init_index = self.pdf_text.index(tk.INSERT)
            end_index = init_index
        return self._rows_from_indices(init_index, end_index)

    def _delete_rows(self, rows):
        """Remove the given line numbers from the PDF pane."""
        # Bottom-up, so the numbers still to be processed stay valid.
        for n in reversed(rows):
            self.pdf_text.delete(str(n) + '.0', str(n) + '.end + 1 char')

    def _del_select_rows(self):
        """Delete the selected (or current) lines from the PDF pane."""
        rows = self._get_rows_intervall()
        if not rows:
            return
        self._delete_rows(rows)
        self._evaluate_state()

    def _data_transfer(self):
        """Move the cursor line, unchanged, from the PDF pane to the CSV pane.

        Meant for lines (such as headers) that the user has already
        formatted by hand, separators included.
        """
        current = self.pdf_text.index(tk.INSERT)
        row = current.split('.')[0]
        text = self.pdf_text.get(row + '.0', row + '.end')
        self.csv_text.configure(state='normal')
        self.csv_text.insert('end', text + '\n')
        self.csv_text.configure(state='disabled')
        self.pdf_text.delete(row + '.0', row + '.end + 1 char')
        self._evaluate_state()

    def _elab_rows(self):
        """Convert the selected lines to separated rows and move them.

        Every line is split on whitespace, re-joined with the separator
        typed in the entry and appended to the CSV pane; the processed
        lines are then removed from the PDF pane.
        """
        rows = self._get_rows_intervall()
        if not rows:
            return
        sep = self.e_sep.get()
        if len(sep) != 1:
            # Tell the user why nothing happened instead of failing silently.
            msgb.showwarning(
                title='Separatore non valido',
                message='Il separatore deve essere un singolo carattere')
            return
        converted = ''.join(
            self._join_fields(
                self.pdf_text.get(str(n) + '.0', str(n) + '.end'), sep) + '\n'
            for n in rows)
        self.csv_text.configure(state='normal')
        self.csv_text.insert('end', converted)
        self.csv_text.configure(state='disabled')
        self._delete_rows(rows)
        self._evaluate_state()

    def _close_file(self):
        """Discard the current work and return to the initial state."""
        self._initialize()

    def _make_csv(self):
        """Ask for a file name and write the CSV pane content to it.

        The pane is emptied only after a successful write; on failure an
        error box is shown and the pane is left untouched and read-only.
        """
        f_types = [('Comma separed values', '*.csv')]
        f_name = fdlg.asksaveasfilename(parent=self,
                                        initialdir=self.init_dir,
                                        title='Creazione CSV',
                                        confirmoverwrite=False,
                                        filetypes=f_types)
        if not f_name:
            return
        # Reading works on a disabled widget, so the pane is enabled only
        # for the final delete and can never be left editable by mistake.
        text = self.csv_text.get('1.0', 'end-1c')
        try:
            with open(f_name, 'w') as f:
                f.write(text)
        except OSError:
            msgb.showerror(title='Generazione fallita',
                           message='Errore scrittura file')
            return
        self.csv_text.configure(state='normal')
        self.csv_text.delete('1.0', 'end')
        self.csv_text.configure(state='disabled')
        self._evaluate_state()
if __name__ == '__main__':
    # Script entry point: create the main window, then hand control to
    # the Tk event loop until the window is closed.
    app = App()
    app.mainloop()