API Documentation#

add_embeddings_to_data#

Add embeddings to document files

data: jsonList = data for which to create embeddings

cols_to_use: list = columns to use for creating text embeddings

model: str = model to use (ada, transformers, etc) (optional)

chunk_size: int = number of rows to keep in one chunk when generating embeddings
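
For illustration, the sketch below shows how a call to this function might be submitted, assuming the API accepts JSON task payloads over HTTP. The call_api helper, the endpoint URL, and the api_key value are placeholder assumptions, not part of this reference; only the parameter names come from the entry above. Later examples on this page reuse the same pattern and show only the args payload.

import requests

# Hypothetical helper: submits a task payload to a placeholder endpoint.
# Replace the URL and authentication with whatever your deployment uses.
def call_api(func: str, args: dict, api_key: str) -> dict:
    resp = requests.post(
        "https://api.example.com/run",  # placeholder endpoint
        json={"func": func, "args": args},
        headers={"Authorization": f"Bearer {api_key}"},
        timeout=300,
    )
    resp.raise_for_status()
    return resp.json()

# Example args for add_embeddings_to_data (parameter names from this reference;
# values are illustrative).
args = {
    "data": [{"doc_name": "report.pdf", "text": "Scope 1 emissions fell 12% in 2022."}],
    "cols_to_use": ["text"],  # columns combined into the text that gets embedded
    "model": "ada",           # optional
    "chunk_size": 100,        # rows embedded per chunk
}
result = call_api("add_embeddings_to_data", args, api_key="YOUR_API_KEY")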


aggregate_data#

Aggregate data to facilitate answering a given query

data: jsonList = input data

data_context: str = a short description of the table (optional)

cols_to_use: list = columns for which to generate meta-data (optional)

max_context_len: int = maximum context length for LLM (optional)

calc_mode: str = how to perform calculations (optional)

  • sync: synchronous mode

  • async: asynchronous mode (default)

  • parallel: parallel mode
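
For illustration, an args payload for aggregate_data might look as follows (values are hypothetical; submission works as in the call_api sketch under add_embeddings_to_data):

# Hypothetical args for aggregate_data; submit as shown under add_embeddings_to_data.
args = {
    "data": [
        {"company": "Acme Corp", "year": 2022, "emissions_tco2e": 1200},
        {"company": "Acme Corp", "year": 2023, "emissions_tco2e": 1100},
    ],
    "data_context": "company emissions by year",            # optional
    "cols_to_use": ["company", "year", "emissions_tco2e"],  # optional
    "max_context_len": 4000,                                # optional
    "calc_mode": "async",  # optional: "sync", "async" (default), or "parallel"
}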


check_file_exists#

Check if a file exists

file: str = file path


check_file_size#

Check file size

file: str = file for which to check size


classify_text#

Classify text

text: str = input text to classify

labels: list = a set of labels to classify text into

multi_class: bool = whether multiple labels are allowed or not
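
For example, a classification call might be parameterised as below (values are illustrative; submission works as in the call_api sketch under add_embeddings_to_data):

# Hypothetical args for classify_text.
args = {
    "text": "The company reduced water consumption by 15% across its plants.",
    "labels": ["environment", "social", "governance"],
    "multi_class": False,  # set True to allow more than one label per text
}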


command_cluster#

Execute commands on cluster

cluster_name: str = name of the cluster to run commands on

commands: list = list of commands to run on cluster


compare_entities#

Compare strengths and weaknesses of entities (e.g. companies) by topics, based on an input dataset containing descriptions of relevant activities

cols_to_compare: list = columns of the input data to use to compare entities

entity_col: str = column containing entity names

topic_col: str = column containing topics by which to compare entities

groupby_cols: list = list of columns to group data by, so that the text in each group is summarised independently

query_cols: list = columns related to strengths and weaknesses, e.g. description of strengths, description of weaknesses, etc.

output_cols: list = output column names

context_cols: list = columns to keep in the output data to track the source of the summarised text (e.g. pagenum, doc_name)

max_context_len: int = maximum length of input text to be summarised together: input text will be broken into chunks of max_context_len
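
As an illustration, an args payload for compare_entities might look like the sketch below; the column names are assumptions about the caller's dataset, not requirements of the API, and submission follows the call_api sketch under add_embeddings_to_data.

# Hypothetical args for compare_entities; column names are illustrative.
args = {
    "cols_to_compare": ["strengths", "weaknesses"],
    "entity_col": "company_name",
    "topic_col": "topic",
    "groupby_cols": ["topic"],
    "query_cols": ["strengths", "weaknesses"],
    "output_cols": ["comparison"],
    "context_cols": ["pagenum", "doc_name"],
    "max_context_len": 4000,
}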


compare_entities_by_themes#

Compare similarities and differences between entities (e.g. companies) by topics, based on an input dataset containing descriptions of relevant activities

cols_to_compare: list = columns of the input data to use to compare entities

entity_col: str = column containing entity names

topic_col: str = column containing topics by which to compare entities

groupby_cols: list = list of columns to group data by, so that the text in each group is summarised independently

query_cols: list = columns related to strengths and weaknesses, e.g. description of strengths, description of weaknesses, etc.

output_cols: list = output column names

context_cols: list = columns to keep in the output data to track the source of the summarised text (e.g. pagenum, doc_name)

max_context_len: int = maximum length of input text to be summarised together: input text will be broken into chunks of max_context_len


convert_jsonl_to_csv#

Convert a json list to csv string

data: jsonList = data to convert to csv


convert_pdf_to_markdown#

Convert PDF file to markdown


convert_py_to_ipynb#

Convert python scripts to ipynb notebooks

git_username: str = git username

repo_name: str = git repo name

access_key: str = access key for git account

target_folder: str = target folder inside the code repo containing the files to be converted (optional); if not specified, all .py files in the repo will be converted

branch_name: str = branch to use for converting py files to ipynb files (optional; main by default)


convert_units#

Convert units

src_unit: str = source unit

tgt_unit: str = target unit


correct_data_errors#

Correct errors in the extracted data. Given source and derived columns, the function finds values in the derived columns that are inconsistent with the information in the source columns.

source_cols: list = list of column names to use as source data or ground truth

derived_cols: list = list of columns derived from source data


create_dataset#


create_dataset_from_tables#

Create a dataset from tables extracted from a document

doc_name: str = document name

attrs: list = attributes/columns to have in the dataset


create_dataset_from_text#

Create a dataset from text passages extracted from a document

doc_name: str = document name

attrs: list = attributes/columns to have in the dataset


create_query_variants#

Create multiple variants of a data query based on meta-data

metadata: jsonList = a json list containing meta-data

query: str = query to run on the data

n_variants: int = number of query variants to generate


create_synonyms#

Create synonyms to an input text

text: str = text for which to create synonyms


create_text_embeddings#

Create text embeddings

docs: list = text passages to embed

model: str = model to use (ada, transformers, etc)

chunk_size: int = number of docs to keep in one chunk when generating embeddings
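
For illustration (values are hypothetical; submission works as in the call_api sketch under add_embeddings_to_data):

# Hypothetical args for create_text_embeddings.
args = {
    "docs": [
        "Net revenue grew 8% year on year.",
        "The board approved a new sustainability policy.",
    ],
    "model": "ada",     # e.g. ada or transformers
    "chunk_size": 100,  # docs embedded per chunk
}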


create_text_graph#

Create knowledge graph from text data

cols_to_use: list = columns to use in the data to create graph

groupby_cols: list = list of columns to group data by, so that the text in each group is summarised independently

output_cols: list = output column names

context_cols: list = columns to keep in the output data to track the source of the summarised text (e.g. pagenum, doc_name)

max_context_len: int = maximum length of input text to be summarised together: input text will be broken into chunks of max_context_len


delete_workspace#

Delete workspace

username: str = username

filename: str = workspace filename to delete


download_document_urls#

Download document URLs for given entity names and document keyphrase

entity_names: list = list of entity/organisation names

doc_keywords: list = list of keywords to download documents by


download_documents#

Download documents from the homepage of organisations

entity_names: list = list of organisations to find homepage for


download_file#

Download a webpage/file from a list of urls

urls: list = list of urls to download


download_matching_files#

Download files matching a file extension from a list of urls

urls: list = list of urls from which to download matching files


embed_doc_data#

Embed document data

doc_name: str = document name

file_pattern: str = file pattern to select input files

cols_to_use: list = columns to use for creating text embeddings

model: str = model to use (ada, transformers, etc) (optional)

chunk_size: int = number of rows to keep in one chunk when generating embeddings


estimate_values#

Estimate values of given attributes based on a dataset

text_data: jsonList = input data from which to estimate values

cols_to_use: list = columns to use in input data for estimating values

metrics_to_estimate: list = metrics to estimate

groupby_cols: list = columns to group input data by, so that values will be estimated separately for each group

context_cols: list = columns to keep in the output data to track the source (e.g. pagenum, doc_name)


extract_attributes#

Extract structured attribute-value pairs from unstructured text data


extract_doc_author#

Extract the author of the document

doc_name: str = name of the document

doc_ext: str = document extension

questions: list = questions to ask to identify document author

max_pages: int = maximum number of pages to consider in the document to identify document author


extract_doc_info#

Extract document info

doc_name: str = name of the document

doc_type_choices: str = document types to classify documents into


extract_doc_org#

Extract organisation name that published the document

doc_name: str = name of the document

doc_ext: str = document extension (optional)

questions: list = questions to ask to identify document organisation

max_pages: int = maximum number of pages to consider in the document to identify document organisation


extract_doc_type#

Classify document type into a pre-defined list of categories

doc_name: str = document name

doc_ext: str = document extension

questions: list = questions to ask to identify document type

doc_type_choices: list = list of document types to choose from

max_pages: int = maximum number of pages to consider in the document to identify document type


extract_doc_year#

Extract the document publication year

doc_name: str = document name

doc_ext: str = document extension (optional)


extract_num_pages#

Extract number of pages from a document

doc_name: str = document name

doc_ext: str = document extension


extract_orig_tables#

Extract tables in their original form from the document

doc_name: str = document name

doc_ext: str = document extension


extract_quant_metrics#

Extract structured quant metrics from unstructured text data


extract_text#

Extract text from images


extract_text_pipeline#

Run text extraction pipeline all the way from OCR to text segmentation and table re-construction

doc_name: str = document name

doc_ext: str = document extension (optional)


extract_text_years#


extractive_qa#

Extract answers to questions

context: str = context to use to answer the question

question: str = question


filter_columns#

Filter columns by relevance to a query

metadata: jsonList = meta-data of the table containing column names and their descriptions

query: str = query to be run on the data

max_context_len: int = maximum context length for LLM (optional)

calc_mode: str = how to perform calculations (optional)

  • sync: synchronous mode

  • async: asynchronous mode (default)

  • parallel: parallel mode


filter_data#

Filter data

data: jsonList = input data

query: str = filtering query

method: str = filtering method (optional)

  • query-relevance: filter data by relevance to the query

  • one-step: filter data based on all the conditions specified in the query in one step

  • multi-step: filter data based on each condition in the query, one step at a time (default)

calc_mode: str = how to perform calculations (optional)

  • sync: synchronous mode

  • async: asynchronous mode (default)

  • parallel: parallel mode
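
For illustration, a filtering call might be parameterised as below (data and query are hypothetical; submission works as in the call_api sketch under add_embeddings_to_data):

# Hypothetical args for filter_data.
args = {
    "data": [
        {"company": "Acme Corp", "year": 2022, "metric": "emissions", "value": 1200},
        {"company": "Acme Corp", "year": 2023, "metric": "water use", "value": 300},
    ],
    "query": "emissions figures reported for 2022",
    "method": "multi-step",  # optional: "query-relevance", "one-step", or "multi-step" (default)
    "calc_mode": "async",    # optional
}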


filter_similarity_scored_data#


find_homepage#

Find homepage for a list of organisations

entity_names: list = list of organisation names


format_recipe_tasks#

Format tasks in a recipe

recipe: str = recipe of tasks

line_break: str = line break separating various tasks (optional)


generate_api_key#

Generate api key

username: str = username

password: str = password

email: str = email

key_name: str = name for the key


generate_metadata#

Generate meta-data for tabular data

data: jsonList = input data

data_context: str = a short description of the table (optional)

cols_to_use: list = columns for which to generate meta-data (optional)

model: str = model to use for generating meta-data (optional)

calc_mode: str = how to perform calculations (optional)

  • sync: synchronous mode

  • async: asynchronous mode (default)

  • parallel: parallel mode


generate_signed_url#

Generate a pre-signed URL for a file

file: str = file for which to generate URL

time_ext: int = expiration time for the generated URL (optional)


generate_training_data#

Generate training data in desired format

doc_name: str = document name from which to generate training data

data_format: str = format of training data (masked-original-tables, masked-structured-data, data-structuring, recall-original-data, generative-qa)

masked_token: str = token to use for masked data (optional)


get_html_tables#

Extract HTML tables from a url


get_last_modified_time#

Get last modified time for a file

file: str = file path for which to get last modified time


get_usage_summary#

Calculate usage summary

username: str = username

start_time: int = start time for usage

end_time: int = end time for usage

route: str = usage route (‘api’, ‘app’, or ‘all’) (optional; ‘all’ by default)


group_topics#

Group topics, and return a dataframe with parent and child topics.

topics: list = list of input topics, which need to be grouped together


infer_from_text#

Make inferences on text (deprecated, use multiple_choice_qa instead)

text: str = text on which to make inferences

question: str = question


launch_cluster#

Launch a compute cluster on the cloud

container_name: str = container to run on the launched cluster

cluster_name: str = name of the cluster to launch (optional; name is randomly generated by default)

cloud: str = cloud where the cluster is to be launched (aws, gcp, azure, etc.) (optional; aws by default)

accelerators: str = type of gpu needed on the cluster (optional; None by default)

use_spot: bool = whether to use spot instances or not (optional; True by default)

port: int = port to deploy the container on the running cluster (optional; 8080 by default)
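
For illustration, a launch request might look like the sketch below; the container name and accelerator spec are placeholder assumptions, and submission follows the call_api sketch under add_embeddings_to_data.

# Hypothetical args for launch_cluster.
args = {
    "container_name": "my-registry/worker:latest",  # placeholder container image
    "cluster_name": "quant-extraction-1",           # optional; random name by default
    "cloud": "aws",                                 # optional; aws by default
    "accelerators": "A100",                         # optional; GPU type, None by default
    "use_spot": True,                               # optional; True by default
    "port": 8080,                                   # optional; 8080 by default
}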


launch_jupyter_notebook#

Launch jupyter_genie notebook

cluster_name: str = name of the cluster to run commands on


list_clusters#

List running clusters

username: str = username for which to list clusters


list_doc_files#

List document files matching a given file pattern

doc_name: str = document name

file_pattern: str = file pattern to list matching files


list_uploads#

List uploads

username: str = username for which to list uploads


merge_doc_files#


merge_doc_info_with_quants#

Merge document info with quants extracted from documents

doc_name: str = document name

quants_file_pattern: str = file pattern to read quants


model_inference#

Make inferences from a pre-trained model

model_name: str = name of the model to use for making inferences

data_file: str = file path for the data on which to make inferences

output_file: str = file path for inference output


multiple_choice_qa#

Multiple-choice QA

context: str = context to use to answer the question

question: str = question

choices: list = possible choices for the answer

multi_class: bool = whether multiple choices are possible or not
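
For example (values are illustrative; submission works as in the call_api sketch under add_embeddings_to_data):

# Hypothetical args for multiple_choice_qa.
args = {
    "context": "The report was published by Acme Corp in March 2023.",
    "question": "Which organisation published the report?",
    "choices": ["Acme Corp", "Globex", "Initech"],
    "multi_class": False,  # set True if more than one choice can be correct
}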


open_email_account#

Open an email account

first_name: str = first name

last_name: str = last name (optional)

email_provider: str = email provider (optional; tutanota by default)


parse_numeric_string#


plot_data#

Generate plots from input data

data: jsonList = input data to make plots from

cat_col: str = category column

val_col: str = value column

groupby: list = columns to group data by when making plots

plot_types: list = type of plots to make
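
For illustration (data and plot type are hypothetical; submission works as in the call_api sketch under add_embeddings_to_data):

# Hypothetical args for plot_data.
args = {
    "data": [
        {"company": "Acme Corp", "year": 2022, "emissions_tco2e": 1200},
        {"company": "Globex", "year": 2022, "emissions_tco2e": 950},
    ],
    "cat_col": "company",
    "val_col": "emissions_tco2e",
    "groupby": ["year"],
    "plot_types": ["bar"],  # available plot type names depend on the deployment
}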


query_gpt#

Query OpenAI models

context: str = context to pass to OpenAI

query: str = query


rank_answers_to_query#

Rank multiple answers to a given query

data: jsonList = a json list containing input data

query: str = query to answer on the data

answers: list = list of candidate answers to rank


rank_attributes_by_relevance#


read_file#

Read file

file: str = file to read


read_quants#


read_recipe#

Read saved recipe

username: str = username

filename: str = filename to read


read_synthesized_data#


read_taxonomy#

Read taxonomy


read_workspace#

Read workspace

username: str = username

filename: str = filename to read


reconstruct_orig_tables#

Reconstruct original tables from table cells


request_enterprise_subscription#

Request an enterprise subscription

use_case: str = description of use-case

budget: str = description of budget

org_name: str = name of organisation

email: str = email to get back to

phone: str = phone number (optional)


reset_password#

Reset password

email: str = email associated with the password

password: str = new password

username: str = user name


score_doc_text_similarity#

Score text similarity for document data

doc_name: str = document name

file_pattern: str = file pattern to select input files

cols_to_use: list = columns to use for creating text embeddings

model: str = model to use (ada, transformers, etc) (optional)

chunk_size: int = number of rows to keep in one chunk when generating embeddings (optional)

frac_rows_to_keep: float = fraction of rows to keep ranked by similarity


score_relevance#

Score the relevance of text segments to a given topic: useful for identifying the text passages most relevant to that topic.

topic: str = topic with which the relevance is to be determined

texts: list = list of text passages that are to be ranked by relevance to topic


score_text_similarity#

Score semantic text similarity for given set of documents and queries

docs: list = text passages to score similarity for

queries: list = queries with which to compute similarity of docs
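
For illustration (values are hypothetical; submission works as in the call_api sketch under add_embeddings_to_data):

# Hypothetical args for score_text_similarity.
args = {
    "docs": [
        "Total emissions were 1,200 tCO2e in 2022.",
        "The CEO discussed the new product line.",
    ],
    "queries": ["greenhouse gas emissions"],
}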


search_web#

Search web for a given list of keyphrases

keyphrases: list = list of keyphrases to search the web for


segment_excel_cells#

Segment excel cells (deprecated)


segment_text#

Segment extracted text using OCR

doc_name: str = document name

file_pattern: str = file pattern to select input files


send_email#

Send email


slugify#

Slugify text

text: str = text to slugify


standardise_data#

Standardise data: looks at all columns of the data together to homogenise the existing names, and can also add new columns if needed

text_data: jsonList = input data

cols_to_std: list = columns to standardise

groupby_cols: list = columns to group input data by, so that a separate dataset will be created for each group

context_cols: list = columns to keep in the output data to track the source (e.g. pagenum, doc_name)


standardise_names#

Standardise names

text_data: jsonList = input data

text_col: str = text column containing names to standardise

groupby_cols: list = columns to group input data by, so that values will be estimated separately for each group

context_cols: list = columns to keep in the output data to track the source (e.g. pagenum, doc_name)


standardise_units#

Standardise units

text_data: jsonList = input data in which to standardise units

text_col: str = column containing unit info

groupby_cols: list = columns to group input data by, so that values will be estimated separately for each group

context_cols: list = columns to keep in the output data to track the source (e.g. pagenum, doc_name)


start_cluster#

Start a stopped cluster

username: str = user name

cluster_name: str = name of the cluster to run commands on


stop_cluster#

Stop cluster

cluster_name: str = name of the cluster to run commands on


structure_passage_quants#

Extract structured quantitative information from text passages of a document

doc_name: str = document name

overwrite: int = whether to overwrite existing output or not


structure_quants_pipeline#

Run quant-structuring pipeline all the way from text extraction from documents, to text translation, quant extraction and structuring

doc_name: str = document name

doc_ext: str = document extension

Additional task parameters (to be passed in task json)

cluster_name: str = name of the cluster to run task on

stop_cluster_after_job: int = whether to stop cluster after the job is done

delete_cluster_after_job: int = whether to delete cluster after the job is done
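
As an illustration, a task payload for this pipeline might look like the sketch below. The split between the args block and the task-level cluster keys is an assumption based on the note above that the additional parameters are passed in the task json; the document and cluster names are placeholders.

# Hypothetical task json for structure_quants_pipeline.
task = {
    "func": "structure_quants_pipeline",
    "args": {
        "doc_name": "acme_annual_report_2023",
        "doc_ext": "pdf",
    },
    "cluster_name": "quant-extraction-1",
    "stop_cluster_after_job": 1,    # stop the cluster once the job finishes
    "delete_cluster_after_job": 0,  # keep the cluster after the job finishes
}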


structure_query#

Structure query by relevance to meta-data

metadata: jsonList = meta-data of the table containing column names and their descriptions

query: str = query to be run on the data

max_context_len: int = maximum context length for LLM (optional)

calc_mode: str = how to perform calculations (optional)

  • sync: synchronous mode

  • async: asynchronous mode (default)

  • parallel: parallel mode


structure_tabular_quants#

Extract structured quantitative information from tables of a document

doc_name: str = document name

overwrite: int = whether to overwrite existing output or not


summarise_table#


summarise_text#


synthesize_qual_data#

Synthesize qualitative data extracted from documents: this will combine document level info with text extracted from document pages, and add text embeddings

doc_name: str = document name


synthesize_quant_data#

Synthesize quantitative data: this will synthesize document level info with quantitative data extracted from the document, and add text embeddings

doc_name: str = document name


terminate_cluster#

Terminate cluster

cluster_name: str = name of the cluster to run commands on


test_api#

Test if the API is working


train_llm#

Train an LLM

username: str = user name

model_name: str = model name to use for saving the model

doc_names: list = list of documents to use for training data

model: str = base model to use for training

training_formats: list = formats of training data to use


train_multimodal_model#


trans_report_segments#

Translate extracted text segments from documents

doc_name: str = document name

overwrite: int = whether to overwrite existing output or not


translate_text_pipeline#

Run text translation pipeline all the way from text extraction from documents to text segmentation and translation

doc_name: str = document name

doc_ext: str = document extension


upload_data#

Upload data

contents: list = contents to be uploaded

filenames: list = filenames to use for uploaded contents

username: str = name of the user uploading content


verify_company_quants#

Verify extracted quants from company disclosures

doc_name: str = document name

file_pattern: str = file pattern to select input files


verify_data#


verify_quants_company_info#


write_and_execute_code#

Write and execute code to perform a task

prompt: str = task prompt

system_message: str = system prompt

conversation_history: list = history of past messages (optional)

model: str = model name to use (optional)

api_key: str = api key for the model (optional)
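
For illustration (prompt, system message, and model name are placeholders; submission works as in the call_api sketch under add_embeddings_to_data):

# Hypothetical args for write_and_execute_code.
args = {
    "prompt": "Compute the year-on-year change for each metric in the input table.",
    "system_message": "You are a careful data analyst. Return only runnable Python.",
    "conversation_history": [],       # optional
    "model": "gpt-4",                 # optional; model naming depends on the deployment
    "api_key": "YOUR_MODEL_API_KEY",  # optional
}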


write_doc_info_to_sql_table#


write_pdf_img#

Save images for all pdf pages in a document

doc_name: str = document name

dpi: int = dots per inch

fmt: str = image format

overwrite: int = whether to overwrite the existing output or not


write_recipe#

Write recipe

username: str = username

filename: str = recipe file name


write_to_file#

Write to file

content = content to write to file

file: str = file path to write content to


write_to_sql_table#

Write data to sql table

data: jsonList = data that needs to be written to sql table


write_workspace#

Write workspace

username: str = username

filename: str = workspace file name