Filter pages from documents most relevant to KPIs of interest
In this example, we will identify the most relevant text and table files extracted from a few documents,
using the page filtering pipeline, which combines a number of steps into a single API call.
To see a step-by-step approach for filtering pages, see `document_processing/filter_relevant_pages.py`
Import necessary libraries
import os
import time
import uuid
import numpy as np
import pandas as pd
import utils.common
import utils.async_utils
from utils.logging import logger
from utils.byte_genie import ByteGenie
Init byte-genie
Init byte-genie in async mode (tasks will run in the background)
# Client used for the pipeline calls below; created with task_mode='async'.
bg_async = ByteGenie(
    secrets_file='secrets.json',
    task_mode='async',
    overwrite=0,
    verbose=1,
)
Init byte-genie in sync mode (tasks will run in the foreground)
# Second client, created with task_mode='sync'.
# NOTE(review): this client reads 'secrets_mcp.json' while the async client
# reads 'secrets.json' -- confirm the different secrets file is intentional.
bg_sync = ByteGenie(
    secrets_file='secrets_mcp.json',
    task_mode='sync',
    overwrite=0,
    verbose=1,
)
Set inputs
Set documents to process
# Names of the documents whose pages will be filtered.
doc_names = [
    'httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf',
    'httpsmultimedia3mcommwsmedia2292786o3m-2023-global-impact-reportpdf',
]
Set keyphrases by which to filter pages
# Keyphrases against which pages are ranked and filtered.
keyphrases = [
    'emission targets',
    'emission reductions',
    'hazardous waste',
    'gender diversity',
    'renewable energy',
    'sustainable revenue',
]
Set maximum rank of pages to keep
file_rank_max = 3
# <p> With `file_rank_max=3`, files are first ranked by relevance to the keyphrases,
# and only the top 3 ranked files are kept for each keyphrase. </p>
Filter pages
# Create one filter-pages pipeline call per document; every call shares the
# same keyphrases and the same rank cut-off.
tasks = [
    bg_async.async_filter_pages_pipeline(
        doc_name=name,
        keyphrases=keyphrases,
        file_rank_max=file_rank_max,
    )
    for name in doc_names
]
# Run the tasks and collect their responses.
filter_pages_responses = utils.async_utils.run_async_tasks(tasks)
# Extract the pipeline output from each response.
filtered_pages = [
    response.get_output()
    for response in filter_pages_responses
]
Check filtered_pages
Set documents to process
# NOTE(review): repeats the identical `doc_names` assignment that appears
# earlier in this file.
# Names of the documents whose pages will be filtered.
doc_names = [
    'httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf',
    'httpsmultimedia3mcommwsmedia2292786o3m-2023-global-impact-reportpdf',
]
Set keyphrases by which to filter pages
# NOTE(review): repeats the identical `keyphrases` assignment that appears
# earlier in this file.
# Keyphrases against which pages are ranked and filtered.
keyphrases = [
    'emission targets',
    'emission reductions',
    'hazardous waste',
    'gender diversity',
    'renewable energy',
    'sustainable revenue',
]
Set maximum rank of pages to keep
# NOTE(review): repeats the identical `file_rank_max` assignment that appears
# earlier in this file.
file_rank_max = 3
# <p> With `file_rank_max=3`, files are first ranked by relevance to the keyphrases,
# and only the top 3 ranked files are kept for each keyphrase. </p>
Filter pages
# NOTE(review): repeats the identical task-building and execution code that
# appears earlier in this file.
# Create one filter-pages pipeline call per document; every call shares the
# same keyphrases and the same rank cut-off.
tasks = [
    bg_async.async_filter_pages_pipeline(
        doc_name=name,
        keyphrases=keyphrases,
        file_rank_max=file_rank_max,
    )
    for name in doc_names
]
# Run the tasks and collect their responses.
filter_pages_responses = utils.async_utils.run_async_tasks(tasks)
# Extract the pipeline output from each response.
filtered_pages = [
    response.get_output()
    for response in filter_pages_responses
]
Check filtered_pages
# NOTE(review): repeats the identical `doc_names` assignment that appears
# earlier in this file.
# Names of the documents whose pages will be filtered.
doc_names = [
    'httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf',
    'httpsmultimedia3mcommwsmedia2292786o3m-2023-global-impact-reportpdf',
]
Set keyphrases by which to filter pages
# NOTE(review): repeats the identical `keyphrases` assignment that appears
# earlier in this file.
# Keyphrases against which pages are ranked and filtered.
keyphrases = [
    'emission targets',
    'emission reductions',
    'hazardous waste',
    'gender diversity',
    'renewable energy',
    'sustainable revenue',
]
Set maximum rank of pages to keep
# NOTE(review): repeats the identical `file_rank_max` assignment that appears
# earlier in this file.
file_rank_max = 3
# <p> With `file_rank_max=3`, files are first ranked by relevance to the keyphrases,
# and only the top 3 ranked files are kept for each keyphrase. </p>
Filter pages
# NOTE(review): repeats the identical task-building and execution code that
# appears earlier in this file.
# Create one filter-pages pipeline call per document; every call shares the
# same keyphrases and the same rank cut-off.
tasks = [
    bg_async.async_filter_pages_pipeline(
        doc_name=name,
        keyphrases=keyphrases,
        file_rank_max=file_rank_max,
    )
    for name in doc_names
]
# Run the tasks and collect their responses.
filter_pages_responses = utils.async_utils.run_async_tasks(tasks)
# Extract the pipeline output from each response.
filtered_pages = [
    response.get_output()
    for response in filter_pages_responses
]
Check filtered_pages
# NOTE(review): repeats the identical `keyphrases` assignment that appears
# earlier in this file.
# Keyphrases against which pages are ranked and filtered.
keyphrases = [
    'emission targets',
    'emission reductions',
    'hazardous waste',
    'gender diversity',
    'renewable energy',
    'sustainable revenue',
]
Set maximum rank of pages to keep
# NOTE(review): repeats the identical `file_rank_max` assignment that appears
# earlier in this file.
file_rank_max = 3
# <p> With `file_rank_max=3`, files are first ranked by relevance to the keyphrases,
# and only the top 3 ranked files are kept for each keyphrase. </p>
Filter pages
# NOTE(review): repeats the identical task-building and execution code that
# appears earlier in this file.
# Create one filter-pages pipeline call per document; every call shares the
# same keyphrases and the same rank cut-off.
tasks = [
    bg_async.async_filter_pages_pipeline(
        doc_name=name,
        keyphrases=keyphrases,
        file_rank_max=file_rank_max,
    )
    for name in doc_names
]
# Run the tasks and collect their responses.
filter_pages_responses = utils.async_utils.run_async_tasks(tasks)
# Extract the pipeline output from each response.
filtered_pages = [
    response.get_output()
    for response in filter_pages_responses
]
Check filtered_pages
Because `filtered_pages` is the output of a pipeline, it retains output from all the steps in that pipeline.
Each entry of `filtered_pages` is a dictionary, with each key of the dictionary holding the output files
from one step of the pipeline.
Types of output available in filtered_pages
# Log the keys available in the first entry of `filtered_pages`.
logger.info(f"Output keys: {list(filtered_pages[0].keys())}")
"""
<div>
<p>
Output keys in `filtered_pages` for one of the documents, `list(filtered_pages[0].keys())`
[
'filtered_files', 'filtered_table_files', 'filtered_text_files', 'img_files', 'table_embedding_files',
'table_files', 'table_similarity_files', 'text_embedding_files', 'text_files', 'text_similarity_files'
]
Different keys in this dictionary contain outputs from different steps.
For example,
<ul>
<li> 'img_files' contains output from converting PDF document to page images; </li>
<li> 'text_files' contains text output files extracted via OCR and layout parsing; </li>
<li> 'text_embedding_files' contains text embedding files; </li>
</ul>
</p>
<p>
Sample of image files, `filtered_pages[0]['img_files']`
[
'gs://db-genie/entity_type=url/entity=httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf/data_type=unstructured/format=img/variable_desc=page-img/source=pdf-genie/httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf_pagenum-0.png',
'gs://db-genie/entity_type=url/entity=httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf/data_type=unstructured/format=img/variable_desc=page-img/source=pdf-genie/httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf_pagenum-1.png',
'gs://db-genie/entity_type=url/entity=httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf/data_type=unstructured/format=img/variable_desc=page-img/source=pdf-genie/httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf_pagenum-2.png',
'gs://db-genie/entity_type=url/entity=httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf/data_type=unstructured/format=img/variable_desc=page-img/source=pdf-genie/httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf_pagenum-3.png',
'gs://db-genie/entity_type=url/entity=httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf/data_type=unstructured/format=img/variable_desc=page-img/source=pdf-genie/httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf_pagenum-4.png'
]
</p>
<p> Sample of text files, `filtered_pages[0]['text_files']`
[
'gs://db-genie/entity_type=url/entity=httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf/data_type=embeddings/format=csv/variable_desc=text-segments/source=layout-genie/httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf_pagenum-1_text-blocks_text-segments_embeddings.csv',
'gs://db-genie/entity_type=url/entity=httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf/data_type=embeddings/format=csv/variable_desc=text-segments/source=layout-genie/httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf_pagenum-2_text-blocks_text-segments_embeddings.csv',
'gs://db-genie/entity_type=url/entity=httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf/data_type=embeddings/format=csv/variable_desc=text-segments/source=layout-genie/httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf_pagenum-3_text-blocks_text-segments_embeddings.csv',
'gs://db-genie/entity_type=url/entity=httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf/data_type=semi-structured/format=csv/variable_desc=text-segments/source=layout-genie/httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf_pagenum-1_text-blocks_text-segments.csv',
'gs://db-genie/entity_type=url/entity=httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf/data_type=semi-structured/format=csv/variable_desc=text-segments/source=layout-genie/httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf_pagenum-2_text-blocks_text-segments.csv',
'gs://db-genie/entity_type=url/entity=httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf/data_type=semi-structured/format=csv/variable_desc=text-segments/source=layout-genie/httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf_pagenum-3_text-blocks_text-segments.csv',
'gs://db-genie/entity_type=url/entity=httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf/data_type=similarity/format=csv/variable_desc=text-segments/source=layout-genie/httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf_pagenum-1_text-blocks_text-segments_embeddings_similarity_query-emission-reductions.csv',
'gs://db-genie/entity_type=url/entity=httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf/data_type=similarity/format=csv/variable_desc=text-segments/source=layout-genie/httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf_pagenum-1_text-blocks_text-segments_embeddings_similarity_query-emission-targets.csv',
'gs://db-genie/entity_type=url/entity=httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf/data_type=similarity/format=csv/variable_desc=text-segments/source=layout-genie/httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf_pagenum-1_text-blocks_text-segments_embeddings_similarity_query-gender-diversity.csv',
'gs://db-genie/entity_type=url/entity=httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf/data_type=similarity/format=csv/variable_desc=text-segments/source=layout-genie/httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf_pagenum-1_text-blocks_text-segments_embeddings_similarity_query-hazardous-waste.csv',
'gs://db-genie/entity_type=url/entity=httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf/data_type=similarity/format=csv/variable_desc=text-segments/source=layout-genie/httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf_pagenum-1_text-blocks_text-segments_embeddings_similarity_query-renewable-energy.csv',
'gs://db-genie/entity_type=url/entity=httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf/data_type=similarity/format=csv/variable_desc=text-segments/source=layout-genie/httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf_pagenum-1_text-blocks_text-segments_embeddings_similarity_query-sustainable-revenue.csv',
'gs://db-genie/entity_type=url/entity=httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf/data_type=similarity/format=csv/variable_desc=text-segments/source=layout-genie/httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf_pagenum-2_text-blocks_text-segments_embeddings_similarity_query-emission-reductions.csv',
'gs://db-genie/entity_type=url/entity=httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf/data_type=similarity/format=csv/variable_desc=text-segments/source=layout-genie/httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf_pagenum-2_text-blocks_text-segments_embeddings_similarity_query-emission-targets.csv',
'gs://db-genie/entity_type=url/entity=httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf/data_type=similarity/format=csv/variable_desc=text-segments/source=layout-genie/httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf_pagenum-2_text-blocks_text-segments_embeddings_similarity_query-gender-diversity.csv',
'gs://db-genie/entity_type=url/entity=httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf/data_type=similarity/format=csv/variable_desc=text-segments/source=layout-genie/httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf_pagenum-2_text-blocks_text-segments_embeddings_similarity_query-hazardous-waste.csv',
'gs://db-genie/entity_type=url/entity=httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf/data_type=similarity/format=csv/variable_desc=text-segments/source=layout-genie/httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf_pagenum-2_text-blocks_text-segments_embeddings_similarity_query-renewable-energy.csv',
'gs://db-genie/entity_type=url/entity=httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf/data_type=similarity/format=csv/variable_desc=text-segments/source=layout-genie/httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf_pagenum-2_text-blocks_text-segments_embeddings_similarity_query-sustainable-revenue.csv',
'gs://db-genie/entity_type=url/entity=httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf/data_type=similarity/format=csv/variable_desc=text-segments/source=layout-genie/httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf_pagenum-3_text-blocks_text-segments_embeddings_similarity_query-emission-reductions.csv',
'gs://db-genie/entity_type=url/entity=httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf/data_type=similarity/format=csv/variable_desc=text-segments/source=layout-genie/httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf_pagenum-3_text-blocks_text-segments_embeddings_similarity_query-emission-targets.csv',
'gs://db-genie/entity_type=url/entity=httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf/data_type=similarity/format=csv/variable_desc=text-segments/source=layout-genie/httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf_pagenum-3_text-blocks_text-segments_embeddings_similarity_query-gender-diversity.csv',
'gs://db-genie/entity_type=url/entity=httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf/data_type=similarity/format=csv/variable_desc=text-segments/source=layout-genie/httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf_pagenum-3_text-blocks_text-segments_embeddings_similarity_query-hazardous-waste.csv',
'gs://db-genie/entity_type=url/entity=httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf/data_type=similarity/format=csv/variable_desc=text-segments/source=layout-genie/httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf_pagenum-3_text-blocks_text-segments_embeddings_similarity_query-renewable-energy.csv',
'gs://db-genie/entity_type=url/entity=httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf/data_type=similarity/format=csv/variable_desc=text-segments/source=layout-genie/httpsmultimedia3mcommwsmedia2053960o3m-pulp-and-paper-sourcing-policy-progress-report-may-2021-finalpdf_pagenum-3_text-blocks_text-segments_embeddings_similarity_query-sustainable-revenue.csv'
]
</p>
</div>
"""