Short-form PDF processing (step by step approach)

Short-form PDF processing (step by step approach)#

This example shows an example of processing pdf documents using a step by step approach, starting from running OCR, retrieving OCR results for text and tables, text segmentation, and ranking/filtering data

import io
import time
import base64
import pandas as pd
from PIL import Image
import utils.async_utils
from utils.logging import logger
from utils.byte_genie import ByteGenie

init byte-genie#

init byte-genie in async mode (tasks will run in the background)#

bg_async = ByteGenie(
    secrets_file='secrets.json',
    task_mode='async',
    verbose=1,
)

init byte-genie in sync mode (tasks will run in the foreground)#

bg_sync = ByteGenie(
    secrets_file='secrets.json',
    task_mode='sync',
    verbose=1,
)

select documents to process#

doc_names = [
    'userid_demo-genie_uploadfilename_renewal-of-hydrant-hosespdf',
    'userid_demo-genie_uploadfilename_aircon-servicingpdf',
    'userid_demo-genie_uploadfilename_repair-of-vehiclespdf',
    'userid_demo-genie_uploadfilename_utility-billspdf',
    'userid_demo-genie_uploadfilename_purchase-of-material-geocom-engineeringpdf',
    'userid_demo-genie_uploadfilename_works_costpdf',
]
"""
See document_processing/upload_files.py (.ipynb) or data_management/upload_files.py (.ipynb) to see how to upload documents 
"""

Extract page images#

extract page images from documents#

tasks = [
    bg_async.async_write_pdf_img(
        doc_name=doc_name
    )
    for doc_num, doc_name in enumerate(doc_names)
]
write_img_responses = utils.async_utils.run_async_tasks(tasks)

list extracted page images#

tasks = [
    bg_sync.async_list_doc_files(
        doc_name=doc_name,
        file_pattern=f"*.png",
        timeout=15 * 60,
    )
    for doc_name in doc_names
]
img_files = utils.async_utils.run_async_tasks(tasks)
"""
image files for the last document, img_files[-1].get_data()
[
    'gs://db-genie/entity_type=url/entity=userid_demo-genie_uploadfilename_purchase-of-material-geocom-engineeringpdf/data_type=unstructured/format=img/variable_desc=page-img/source=pdf-genie/purchase-of-material-geocom-engineeringpdf_pagenum-0.png', 
    'gs://db-genie/entity_type=url/entity=userid_demo-genie_uploadfilename_purchase-of-material-geocom-engineeringpdf/data_type=unstructured/format=img/variable_desc=page-img/source=pdf-genie/purchase-of-material-geocom-engineeringpdf_pagenum-1.png'
]
"""

Extract text from documents#

Run OCR on page images#

ocr_start_time = time.time()
responses = []
for doc_num, doc_name in enumerate(doc_names):
    logger.info(f"triggering OCR for ({doc_num}/{len(doc_names)}: {doc_name})")
    resp = bg_async.extract_text(
        doc_name=doc_name
    )
    responses = responses + [resp]

list OCR output files for text#

tasks = [
    bg_sync.async_list_doc_files(
        doc_name=doc_name,
        file_pattern="variable_desc=text-blocks/**.csv"
    )
    for doc_name in doc_names
]
ocr_text_files = utils.async_utils.run_async_tasks(tasks)
ocr_text_files = [resp.get_data() for resp in ocr_text_files if resp.get_data() is not None]
"""
Number of documents with OCR text files, len(ocr_text_files): 5
Number of OCR text files for one document, len(ocr_text_files[-1]): 2
OCR text files for one document: ocr_text_files[-1]
[
    'gs://db-genie/entity_type=url/entity=userid_demo-genie_uploadfilename_works_costpdf/data_type=unstructured/format=img/variable_desc=page-img/source=pdf-genie/works_costpdf_pagenum-0.png', 
    'gs://db-genie/entity_type=url/entity=userid_demo-genie_uploadfilename_works_costpdf/data_type=unstructured/format=img/variable_desc=page-img/source=pdf-genie/works_costpdf_pagenum-1.png'
]
"""

list OCR output files for tables#

tasks = [
    bg_sync.async_list_doc_files(
        doc_name=doc_name,
        file_pattern="variable_desc=table-cells/**.csv"
    )
    for doc_name in doc_names
]
ocr_table_files = utils.async_utils.run_async_tasks(tasks)
ocr_table_files = [resp.get_data() for resp in ocr_table_files if resp.get_data() is not None]
"""
Number of documents with OCR table output files, len(ocr_table_files): 5
Number of OCR table files for one document, len(ocr_table_files[-1]): 2
ocr_table_files[-1]
[
    'gs://db-genie/entity_type=url/entity=userid_demo-genie_uploadfilename_works_costpdf/data_type=semi-structured/format=csv/variable_desc=table-cells/source=esgnie.com/works_costpdf_pagenum-0_table-cells.csv', 
    'gs://db-genie/entity_type=url/entity=userid_demo-genie_uploadfilename_works_costpdf/data_type=semi-structured/format=csv/variable_desc=table-cells/source=esgnie.com/works_costpdf_pagenum-1_table-cells.csv'
]
"""

Format OCR table output#

Format OCR extracted tables#

OCR extracted tables return tables in a standardised (row, col, cell) format, and require an additional layer of processing to reconstruct the original table

trigger original table reconstruction#

OCR output generates table-cells files, which we will now use as an input to reconstruct original tables

responses = []
for doc_num, doc_name in enumerate(doc_names):
    logger.info(f"triggering original table reconstruction for ({doc_num}/{len(doc_names)}: {doc_name})")
    resp = bg_async.reconstruct_orig_tables(
        doc_name=doc_name,
        file_pattern='variable_desc=table-cells/**.csv',
    )
    responses = responses + [resp]

list original table files#

tasks = [
    bg_sync.async_list_doc_files(
        doc_name=doc_name,
        file_pattern='variable_desc=orig-table/**.csv',
    )
    for doc_name in doc_names
]
orig_table_files = utils.async_utils.run_async_tasks(tasks)
orig_table_files = [resp.get_data() for resp in orig_table_files]
"""
number of documents with original table files, len(orig_table_files): 6
original table files for one document, orig_table_files[-1]
[
    'gs://db-genie/entity_type=url/entity=userid_demo-genie_uploadfilename_works_costpdf/data_type=semi-structured/format=csv/variable_desc=orig-table/source=api-genie/works_costpdf_pagenum-0_table-cells_orig-table_tablenum-0.csv', 
    'gs://db-genie/entity_type=url/entity=userid_demo-genie_uploadfilename_works_costpdf/data_type=semi-structured/format=csv/variable_desc=orig-table/source=api-genie/works_costpdf_pagenum-1_table-cells_orig-table_tablenum-0.csv'
]
"""

Process extracted tables#

set files to read#

table_file = 'gs://db-genie/entity_type=url/entity=userid_demo-genie_uploadfilename_works_costpdf/data_type=semi-structured/format=csv/variable_desc=orig-table/source=api-genie/works_costpdf_pagenum-0_table-cells_orig-table_tablenum-0.csv'

read a table file#

df_table = bg_sync.read_file(table_file).get_data()
df_table = pd.DataFrame(df_table)

check table data#

logger.info(f"{len(df_table)} rows in df_table")
"""
list(df_table.columns)
['Description', 'Description_2', 'Item', 'Phase 1 Amount (S$)', 'Phase 2A & 2B Amount (S$)', 'Unit']
df_table.head().to_dict('records')
[
    {'Description': "Depressed Road and Lay-by (Cont'd)", 'Description_2': "Depressed Road and Lay-by (Cont'd)", 'Item': '', 'Phase 1 Amount (S$)': '', 'Phase 2A & 2B Amount (S$)': '', 'Unit': ''}, 
    {'Description': 'Painting; to parapet wall; in accordance to CAAS MOAS requirement', 'Description_2': '$ 9.40 / m2', 'Item': 'A', 'Phase 1 Amount (S$)': '100,828.00', 'Phase 2A & 2B Amount (S$)': '246,988.00', 'Unit': 'Item'}, 
    {'Description': 'Vehicular grating in hot dipped galvanised mild steel; 800 X 100mm thick; including all necessary fittings, fixings and accessories', 'Description_2': '$ 450.00 / m', 'Item': 'B', 'Phase 1 Amount (S$)': '283,500.00', 'Phase 2A & 2B Amount (S$)': '1,705,050.00', 'Unit': 'Item'}, 
    {'Description': 'Precast concrete plank; 650 X 75mm thick', 'Description_2': '$ 40.02 / m', 'Item': 'C', 'Phase 1 Amount (S$)': '11,205.60', 'Phase 2A & 2B Amount (S$)': '101,090.52', 'Unit': 'Item'}, {'Description': 'Precast concrete plank; 1150 X 75mm thick', 'Description_2': '$ 73.88 / m', 'Item': 'D', 'Phase 1 Amount (S$)': '20,686.40', 'Phase 2A & 2B Amount (S$)': '186,620.88', 'Unit': 'Item'}
]
"""

re-structure table data#

resp = bg_async.create_dataset(
    data=df_table.to_dict('records'),
    attrs=['product category', 'complete product description', 'key material used in the product',
           'per unit amount', 'phase 1 amount', 'phase 2 amount', 'currency']
)
if resp.check_output_file_exists():
    df_dataset = resp.read_output_data()
    df_dataset = pd.DataFrame(df_dataset)
else:
    logger.info(f"create_dataset() output is not yet ready: wait some more")

pivot df_dataset#

df_dataset_wide = df_dataset.pivot(
    index=['context', 'row_num'],
    columns='variable',
    values='value',
).reset_index()

check df_dataset_wide#

logger.info(f"shape of df_dataset_wide: {df_dataset_wide.shape}")
"""
list(df_dataset_wide.columns)
['context', 'row_num', 'complete product description', 'currency', 'key material used in the product', 'per unit amount', 'phase 1 amount', 'phase 2 amount', 'product category', 'relevant quote']
df_dataset_wide.drop(columns=['context', 'row_num', 'relevant quote']).head().to_dict('records')
[
    {'complete product description': 'Painting; to parapet wall; in accordance to CAAS MOAS requirement', 'currency': 'n/a', 'key material used in the product': 'n/a', 'per unit amount': '$ 9.40 / m2', 'phase 1 amount': '$ 100,828.00', 'phase 2 amount': '$ 246,988.00', 'product category': 'A'}, 
    {'complete product description': 'Vehicular grating in hot dipped galvanised mild steel; 800 X 100mm thick; including all necessary fittings', 'currency': 'n/a', 'key material used in the product': 'hot dipped galvanised mild steel', 'per unit amount': '$ 450.00 / m', 'phase 1 amount': '$ 283,500.00', 'phase 2 amount': '$ 1,705,050.00', 'product category': 'B'}, 
    {'complete product description': 'Precast concrete plank; 650 X 75mm thick', 'currency': 'n/a', 'key material used in the product': 'n/a', 'per unit amount': '$ 40.02 / m', 'phase 1 amount': '$ 11,205.60', 'phase 2 amount': '$ 101,090.52', 'product category': 'C'}, 
    {'complete product description': 'Precast concrete plank; 1150 X 75mm thick', 'currency': 'n/a', 'key material used in the product': 'n/a', 'per unit amount': '$ 73.88 / m', 'phase 1 amount': '$ 20,686.40', 'phase 2 amount': '$ 186,620.88', 'product category': 'D'}, 
    {'complete product description': 'Precast concrete plank; various length X 75mm thick', 'currency': 'n/a', 'key material used in the product': 'n/a', 'per unit amount': '$ 61.60 / m2', 'phase 1 amount': '$ 648.03', 'phase 2 amount': '$ 21,827.20', 'product category': 'E'}
]
"""

Format OCR text output#

Segment OCR extracted text#

OCR extracted text includes text/words along with their coordinates. It needs one more layer of intelligent processing to decide which words were grouped together into a single passage, or table in the original document, to reconstruct the original text.

segment_text_responses = []
for doc_num, doc_name in enumerate(doc_names):
    logger.info(f"triggering segment_text for ({doc_num}/{len(doc_names)}): {doc_name}")
    segment_text_resp = bg_async.segment_text(
        doc_name=doc_name,
    )
    segment_text_responses = segment_text_responses + [segment_text_resp]

read segment_text output#

segment_text_files = []
missing_segment_text_files = []
for resp_num, resp in enumerate(segment_text_responses):
    logger.info(f"processing segment_text resp: {resp_num}/{len(segment_text_responses)}")
    output_file = resp.get_output_file()
    if resp.check_output_file_exists():
        segment_text_files = segment_text_files + [resp.get_output_file()]
    else:
        missing_segment_text_files = missing_segment_text_files + [resp.get_output_file()]