Analyse data for cement companies#

import time
import pandas as pd
import utils.common
import utils.async_utils
from utils.logging import logger
from utils.byte_genie import ByteGenie

init byte-genie#

init byte-genie in async mode (tasks will run in the background)#

bg_async = ByteGenie(
    secrets_file='secrets.json',
    task_mode='async',
    verbose=1,
)

init byte-genie in sync mode (tasks will run in the foreground)#

bg_sync = ByteGenie(
    secrets_file='secrets.json',
    task_mode='sync',
    verbose=1,
)

set inputs#

set company names#

company_names = [
    'Ultratech Cement',
    'Cemex Inc',
    'ACC Limited',
    'Heidelberg Materials Inc',
    'JK Cement',
    'Shree Cement',
    'China Resources Cement',
    'Eurocement Group',
    'Birla Corporation',
    'Lafarge Inc',
]
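
set document keywords#

The document download call below also expects a `doc_keywords` input, which is not defined elsewhere in this section. A minimal sketch, assuming we want to search for sustainability and annual reports (these keyphrases are illustrative and can be adjusted):

doc_keywords = [
    'sustainability report',  ## assumed keyphrase; adjust as needed
    'annual report',  ## assumed keyphrase; adjust as needed
]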

Data sourcing#

trigger document download#

resp = bg_async.download_documents(
    entity_names=company_names,
    doc_keywords=doc_keywords,
)

wait for output to exist#

time.sleep(60 * 60)
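
Alternatively, instead of sleeping for a fixed hour, we can poll for the output file; a minimal sketch using `.check_output_file_exists()` (the same helper used later in this example), assuming we check every 5 minutes for up to an hour:

total_wait = 0
while (not resp.check_output_file_exists()) and (total_wait < 60 * 60):
    ## wait 5 minutes before checking again
    time.sleep(5 * 60)
    total_wait += 5 * 60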

get output#



`.get_output()` checks whether the `output_file` for the task exists, and if it does, it returns the output.
If the `output_file` does not yet exist, `.get_output()` will not return anything, and will just print a message asking you to wait until the output exists.
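
For example, a simple readiness check before building the dataframe (a sketch, assuming `.get_output()` returns `None` while the output file is missing):

output = resp.get_output()
if output is None:
    logger.info("document download output not available yet; wait longer before retrying")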


df_document_urls = pd.DataFrame(resp.get_output())

Get unique doc_name#

doc_names = df_document_urls['doc_name'].unique().tolist()

save df_document_urls to local file#

df_document_urls.to_csv(f"/tmp/document-urls_cement-companies.csv", index=False)

Extract document info for downloaded documents#

make api calls#

tasks = [
    bg_async.async_extract_doc_info(
        doc_name=doc_name
    )
    for doc_name in doc_names
]
doc_info_responses = utils.async_utils.run_async_tasks(tasks)

Retrieve output from API responses#

df_doc_info = [resp.get_output() for resp in doc_info_responses]
df_doc_info = [pd.DataFrame(df) for df in df_doc_info]
df_doc_info = pd.concat(df_doc_info)

check available output#

logger.info(f"{len(df_doc_info)} rows found in doc_info df")
"""
A sample of document info dataframe, `df_doc_info.head().to_dict('records')`
[
    {'doc_name': 'httpssustainabilityadityabirlacomabg-esg-reportabg-esg-full-report-2021-final-file-for-web-upload-28-feb-2022pdf',
     'doc_org': 'GSE', 'doc_type': "['annual report']", 'doc_year': 2022, 'num_pages': 82.0},
    {'doc_name': 'httpssustainabilityadityabirlacompdfreportspdfhindalco-sustainability-report-2016-17-2017pdf',
     'doc_org': 'Hindalco', 'doc_type': "['sustainability report']", 'doc_year': 2018, 'num_pages': 82.0},
    {'doc_name': 'httpssustainabilityadityabirlacompdfreportspdfpolicies_reports_pdf_30_1614145577pdf',
     'doc_org': 'Birla Cellulose', 'doc_type': "['sustainability report']", 'doc_year': 2009, 'num_pages': 79.0},
    {'doc_name': 'httpssustainabilityadityabirlacompdfreportspdfhindalco-sustainability-report-fy19pdf',
     'doc_org': 'Hindalco Industries Limited', 'doc_type': "['sustainability report']", 'doc_year': 2019,
     'num_pages': 132.0},
    {'doc_name': 'httpssustainabilityadityabirlacompdfreportspdfsustainability-report-20-21pdf', 'doc_org': 'Employee',
     'doc_type': "['sustainability report']", 'doc_year': 2021, 'num_pages': 120.0}
]
"""

check documents with missing document info#

missing_doc_names = [doc_name for doc_name in doc_names if doc_name not in df_doc_info['doc_name'].unique().tolist()]
logger.info(f"{len(missing_doc_names)} documents with missing doc_info output")

save document info locally#

tmp_file = f"/tmp/downloaded-docs_cement-companies.csv"
df_doc_info.to_csv(tmp_file)

read data from local file#

df_doc_info = pd.read_csv(tmp_file)

Process document info#

get unique doc_year#

doc_years = df_doc_info['doc_year'].unique().tolist()

convert years to numeric#

tasks = [
    bg_sync.async_extract_text_years(
        text=str(yr)
    )
    for yr in doc_years
]
extract_year_responses = utils.async_utils.run_async_tasks(tasks)
df_years_num = [pd.DataFrame(resp.get_output()) for resp in extract_year_responses]
df_years_num = pd.concat(df_years_num)
"""
Sample of numeric years, `df_years_num.to_dict('records')`
[
    {'text': '2022', 'year': '2022'}, 
    {'text': '2018', 'year': '2018'}, 
    {'text': '2009', 'year': '2009'},
    {'text': '2019', 'year': '2019'}, 
    {'text': '2021', 'year': '2021'},
    {'text': '2018-2019', 'year': '2018'},
    {'text': '2016-17', 'year': '2016'}, 
    {'text': '2019-20', 'year': '2019'},
]
As we can see, `/extract_text_years` has converted texts like '2019-20' and '2016-17' into numeric 4-digit years, 2019 and 2016, respectively.
"""

merge numeric years onto df_doc_info#

df_doc_info = pd.merge(
    left=df_doc_info,
    right=df_years_num.rename(
        columns={'text': 'doc_year',
                 'year': 'doc_year_num'}
    ),
    on=['doc_year'],
    how='left'
)

convert doc_year_num to float type#

df_doc_info['doc_year_num'] = df_doc_info['doc_year_num'].astype(float)

check numeric years#


"""
A comparison of original and numeric document years,
`df_doc_info[['doc_year', 'doc_year_num']].drop_duplicates().values.tolist()`
[
    ['2022', '2022'], ['2018', '2018'], ['2009', '2009'], ['2019', '2019'], ['2021', '2021'], ['2017', '2017'],
    ['2020', '2020'], ['2014', '2014'], ['2023', '2023'], ['2020-21', '2020'], ['2018-2019', '2018'],
    ['2016-17', '2016'], ['2019-20', '2019'], ['2011', '2011'], ['2016', '2016'], ['2012', '2012'], ['2015', '2015'],
    ['2018-19', '2018'], ['2006', '2006'], [nan, nan], ['2013', '2013'], ['2011/2012', '2011'], ['2011/2012', '2012'],
    ['2002', '2002'], ['2007', '2007'], ['2010', '2010'], ['2004', '2004'], ['2019-2020', '2019']
]
"""

convert doc_type to string format#

df_doc_info = utils.common.convert_list_cols_to_str(
    df=df_doc_info,
    cols=['doc_type']
)

check doc_type#


"""
Unique document types inferred from document text, `df_doc_info['doc_type'].unique().tolist()`
['annual report', 'sustainability report', 'TCFD report', 'financial statement', nan, 'other',
 'sustainable financing document', 'human rights policy',
 'sustainable financing document; sustainability report', 'press release']
"""

save data locally#

df_doc_info.to_csv(f"/tmp/downloaded-docs_cement-companies_processed.csv", index=False)

Merge doc_info with document_url data#

df_document_urls = pd.read_csv("/tmp/document-urls_cement-companies.csv")
df_doc_info = pd.read_csv(f"/tmp/downloaded-docs_cement-companies_processed.csv")
df_doc_details = pd.merge(
    left=df_document_urls,
    right=df_doc_info,
    on=['doc_name'],
    how='left'
)

check df_doc_details#


"""
A sample of extracted document details, `df_doc_details.head().to_dict('records')`
[
{
'doc_name': 'httpssustainabilityadityabirlacomabg-esg-reportabg-esg-full-report-2021-final-file-for-web-upload-28-feb-2022pdf',
'entity_name': 'Ultratech Cement',
'href': 'https://sustainability.adityabirla.com/ABG-ESG-Report/ABG-ESG-Full-Report-2021-Final-File-for-Web-Upload-28-Feb-2022.pdf',
'href_text': nan, 'keyphrase': 'sustainability reports', 'page_summary': nan,
'result_html': '

ABG Report for PDF

adityabirla.com
https://sustainability.adityabirla.com › ABG-ES...
adityabirla.com
https://sustainability.adityabirla.com › ABG-ES...
PDF
Mar 9, 2022This report summarises the performance of ABG businesses, with their diversity of sectors, geographies, across Environment, Social and.
82 pages
',
'result_text': 'ABG Report for PDF\nadityabirla.com\nhttps://sustainability.adityabirla.com › ABG-ES...\nPDF\nMar 9, 2022 — This report summarises the performance of ABG businesses, with their diversity of sectors, geographies, across Environment, Social and.\n82 pages',
'doc_org': 'GSE', 'doc_type': "['annual report']", 'doc_year': '2022', 'num_pages': 82.0,
'doc_year_num': 2022.0},
{'doc_name': 'httpssustainabilityadityabirlacompdfreportspdfhindalco-sustainability-report-2016-17-2017pdf',
'entity_name': 'Ultratech Cement',
'href': 'https://sustainability.adityabirla.com/pdf/reportspdf/Hindalco-Sustainability-Report-2016-17-2017.pdf',
'href_text': nan, 'keyphrase': 'sustainability reports', 'page_summary': nan,
'result_html': '
Based on the financial year, our sustainability report is annually published3 and all our sustainability reports are available online on our website http://www.
',
'result_text': 'Hindalco Sustainability Report 2016 - 17\nadityabirla.com\nhttps://sustainability.adityabirla.com › reportspdf\nPDF\nBased on the financial year, our sustainability report is annually published3 and all our sustainability reports are available online on our website http://www.',
'doc_org': 'Hindalco', 'doc_type': "['sustainability report']", 'doc_year': '2018', 'num_pages': 82.0,
'doc_year_num': 2018.0},
{'doc_name': 'httpssustainabilityadityabirlacompdfreportspdfpolicies_reports_pdf_30_1614145577pdf',
'entity_name': 'Ultratech Cement',
'href': 'https://sustainability.adityabirla.com/pdf/reportspdf/policies_reports_pdf_30_1614145577.pdf',
'href_text': nan, 'keyphrase': 'sustainability reports', 'page_summary': nan,
'result_html': '

Sustainability Report 2019-20

adityabirla.com
https://sustainability.adityabirla.com › reportspdf
adityabirla.com
https://sustainability.adityabirla.com › reportspdf
PDF
Feb 24, 2021This report follows the structure of our first report where the first part showcases the intrinsic sustainability attributes of man-made ...
',
'result_text': 'Sustainability Report 2019-20\nadityabirla.com\nhttps://sustainability.adityabirla.com › reportspdf\nPDF\nFeb 24, 2021 — This report follows the structure of our first report where the first part showcases the intrinsic sustainability attributes of man-made ...',
'doc_org': 'Birla Cellulose', 'doc_type': "['sustainability report']", 'doc_year': '2009', 'num_pages': 79.0,
'doc_year_num': 2009.0},
{'doc_name': 'httpssustainabilityadityabirlacompdfreportspdfhindalco-sustainability-report-fy19pdf',
'entity_name': 'Ultratech Cement',
'href': 'https://sustainability.adityabirla.com/pdf/reportspdf/hindalco-sustainability-report-fy19.pdf',
'href_text': nan, 'keyphrase': 'sustainability reports', 'page_summary': nan,
'result_html': '

Sustainability report 2018-19

adityabirla.com
https://sustainability.adityabirla.com › reportspdf
adityabirla.com
https://sustainability.adityabirla.com › reportspdf
PDF
We publish our sustainability reports on an annual basis. All of our sustainability reports, including the previous sustainability.
',
'result_text': 'Sustainability report 2018-19\nadityabirla.com\nhttps://sustainability.adityabirla.com › reportspdf\nPDF\nWe publish our sustainability reports on an annual basis. All of our sustainability reports, including the previous sustainability.',
'doc_org': 'Hindalco Industries Limited', 'doc_type': "['sustainability report']", 'doc_year': '2019',
'num_pages': 132.0, 'doc_year_num': 2019.0},
{'doc_name': 'httpssustainabilityadityabirlacompdfreportspdfsustainability-report-20-21pdf',
'entity_name': 'Ultratech Cement',
'href': 'https://sustainability.adityabirla.com/pdf/reportspdf/sustainability-report-20-21.pdf', 'href_text': nan,
'keyphrase': 'sustainability reports', 'page_summary': nan,
'result_html': '
This chapter details EMIL\'s environmental journey, which involves responsible mining, energy initiatives, Scope1, Scope 2 and other air emissions, water ...
120 pages
',
'result_text': "REPORT 2020-21 - ABG Sustainability - Aditya Birla Group\nadityabirla.com\nhttps://sustainability.adityabirla.com › reportspdf\nPDF\nThis chapter details EMIL's environmental journey, which involves responsible mining, energy initiatives, Scope1, Scope 2 and other air emissions, water ...\n120 pages",
'doc_org': 'Employee', 'doc_type': "['sustainability report']", 'doc_year': '2021', 'num_pages': 120.0,
'doc_year_num': 2021.0}
]
"""

save df_doc_details to local file#

df_doc_details.to_csv(f"/tmp/doc-details_cement-companies.csv", index=False)

filter documents#

read data from file#

df_doc_details = pd.read_csv(f"/tmp/doc-details_cement-companies.csv")

filter documents by doc_type, doc_year, num_pages#

df_doc_details = df_doc_details[
    (df_doc_details['doc_year_num'] > 2021) &
    (df_doc_details['doc_type'].str.contains('annual report|sustainability report', na=False)) &  ## na=False handles missing doc_type values
    (df_doc_details['num_pages'] >= 20)
]
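
To confirm the effect of the filter, we can log how many documents remain (a small sketch):

logger.info(f"{len(df_doc_details['doc_name'].unique())} documents remaining after filtering")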

trigger processing for selected documents#

get document names#

doc_names = df_doc_details['doc_name'].unique().tolist()

trigger processing for documents, in batches of 15, to avoid exceeding the rate limit#

for doc_num, doc_name in enumerate(doc_names):
    logger.info(f"triggering processing for ({doc_num}/{len(doc_names)}): {doc_name}")
    resp_ = bg_async.structure_quants_pipeline(
        doc_name=doc_name,
    )
    if (doc_num > 0) and (doc_num % 15 == 0):
        time.sleep(1 * 60 * 60)
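
wait for processing to complete#

Since `structure_quants_pipeline` runs in async mode, its output will not be available immediately; a simple wait before checking for output (the duration here is illustrative, as in the download step earlier):

time.sleep(60 * 60)  ## illustrative wait; adjust based on how long document processing typically takes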

check if synthesized-quants data exists#

quant_files = {}
for doc_num, doc_name in enumerate(doc_names):
    logger.info(f"checking quants data for ({doc_num}/{len(doc_names)}): {doc_name}")
    quant_files_ = bg_sync.list_doc_files(
        doc_name=doc_name,
        file_pattern='variable_desc=synthesized-quants/**.csv',
    ).get_data()
    if quant_files_ is not None:
        logger.info(f"found {len(quant_files_)} quant files for {doc_name}")
        quant_files[doc_name] = quant_files_

check quant_files#


"""
len(quant_files)
49
len(quant_files) == len(doc_names)
True
"""

Handle missing output#


Note that document processing may sometimes fail to complete successfully due to transient errors, such as API call timeouts or rate-limit errors.
In this case, the document processing pipeline can simply be triggered again. By default, API calls check for previously existing output first,
and only generate new output if it does not already exist. Hence, re-triggering a document processing pipeline will
just fill in any missing output, while leaving the existing output intact.
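
For example, documents that do not yet have synthesized-quants output can be identified from `quant_files` and re-processed; a minimal sketch reusing the same pipeline call as above:

## re-trigger processing for documents with no quants output yet
missing_quant_docs = [doc_name for doc_name in doc_names if doc_name not in quant_files.keys()]
for doc_num, doc_name in enumerate(missing_quant_docs):
    logger.info(f"re-triggering processing for ({doc_num}/{len(missing_quant_docs)}): {doc_name}")
    resp_ = bg_async.structure_quants_pipeline(
        doc_name=doc_name,
    )
    if (doc_num > 0) and (doc_num % 15 == 0):
        time.sleep(1 * 60 * 60)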

rank data by relevance to most relevant topics#

define a set of relevant keyphrases to search in extracted data from documents#

keyphrases = {
    'quantitative': ['cement production', 'revenue', 'emissions by scope', 'emission intensity', 'energy consumption'],
    'qualitative': ['emission reduction measures', 'revenue growth projection', 'business risk',
                    'cement industry trends', 'decarbonisation plans', 'climate risks', 'climate resilience']
}
set the fraction of rows to keep in ranked data#

frac_rows_to_keep = 0.1

Rank quantitative and qualitative data by relevance to set of keyphrases for each document#



Now we can iterate over each document and keyphrase, and rank each document's data by relevance to each keyphrase.
This will allow us to keep only the most relevant portions of each document for downstream processing and analyses.


responses = []
output_files = []
missing_files = []
## loop over every document
for doc_num, doc_name in enumerate(doc_names):
    ## loop over types of keyphrases (quantitative or qualitative)
    for type_num, keyphrase_type in enumerate(keyphrases.keys()):
        ## loop over keyphrases
        for keyphrase_num, keyphrase in enumerate(keyphrases[keyphrase_type]):
            logger.info(f"{doc_name} ({doc_num}/{len(doc_names)}); "
                        f"{keyphrase_type} ({type_num}/{len(keyphrases.keys())}); "
                        f"{keyphrase} ({keyphrase_num}/{len(keyphrases[keyphrase_type])});")
            ## run data ranking
            resp = bg_async.rank_data(
                doc_name=doc_name,
                attr=keyphrase,
                attr_type=keyphrase_type,
                frac_rows_to_keep=frac_rows_to_keep,
            )
            responses = responses + [resp]
            ## if output file exists
            if resp.check_output_file_exists():
                ## add output file to output_files
                output_files = output_files + [resp.get_output_file()]
            ## if output file does not exist
            else:
                ## add output file to missing_files
                missing_files = missing_files + [resp.get_output_file()]

check available output files for ranked data#

logger.info(f"{len(output_files)} available output files for ranked data")

check missing files for ranked data#

logger.info(f"{len(missing_files)} missing output files for ranked data")