API Documentation#
add_embeddings_to_data#
Add embeddings to document files
data: jsonList = data for which to create embeddings
cols_to_use: list = columns to use for creating text embeddings
model: str = model to use (ada, transformers, etc) (optional)
chunk_size: int = number of rows to keep in one chunk when generating embeddings
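A minimal sketch of how a call to add_embeddings_to_data might be assembled and submitted. The endpoint URL, task wrapper, and header name are illustrative assumptions, not part of the documented API; only the argument names come from the parameter list above.

```python
import requests

# Input rows as a jsonList: a list of dicts, one dict per row (assumed shape).
data = [
    {"doc_name": "report_2022.pdf", "pagenum": 1, "text": "Revenue grew 12% year on year."},
    {"doc_name": "report_2022.pdf", "pagenum": 2, "text": "Emissions fell by 8%."},
]

payload = {
    "func": "add_embeddings_to_data",  # hypothetical task wrapper
    "args": {
        "data": data,
        "cols_to_use": ["text"],       # columns used to build the text that gets embedded
        "model": "ada",                # optional; ada, transformers, etc.
        "chunk_size": 100,             # rows per chunk when generating embeddings
    },
}

# Hypothetical endpoint and auth header; substitute the values for your deployment.
resp = requests.post(
    "https://api.example.com/task",
    json=payload,
    headers={"Authorization": "Bearer <api_key>"},
)
print(resp.json())
```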
aggregate_data#
Aggregate data to facilitate answering a given query
data: jsonList = input data
data_context: str = a short description of the table (optional)
cols_to_use: list = columns for which to generate meta-data (optional)
max_context_len: int = maximum context length for LLM (optional)
calc_mode: str = how to perform calculations (optional)
sync: synchronous mode
async: asynchronous mode (default)
parallel: parallel mode
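A sketch of an aggregate_data argument set, shown only to make the calc_mode options concrete; the sample rows and the surrounding call mechanics (see the add_embeddings_to_data example above) are assumptions.

```python
# jsonList input: one dict per row (assumed shape).
args = {
    "data": [
        {"company": "Acme", "year": 2022, "emissions": 120.5},
        {"company": "Acme", "year": 2023, "emissions": 110.0},
    ],
    "data_context": "company emissions by year",      # optional short table description
    "cols_to_use": ["company", "year", "emissions"],  # optional; columns for which to generate meta-data
    "max_context_len": 4000,                          # optional LLM context budget
    "calc_mode": "async",                             # sync, async (default), or parallel
}
print(args)
```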
classify_text#
Classify text
text: str = input text to classify
labels: list = a set of labels to classify text into
multi_class: bool = whether multiple labels are allowed or not
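A small illustrative argument set for classify_text; the sample text and labels are made up.

```python
args = {
    "text": "The company reported a 30% reduction in water usage.",
    "labels": ["environment", "social", "governance"],
    "multi_class": False,  # set True to allow more than one label per text
}
print(args)
```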
command_cluster#
Execute commands on cluster
cluster_name: str = name of the cluster to run commands on
commands: list = list of commands to run on cluster
compare_entities#
Compare strengths and weaknesses of entities (e.g. companies) by topics, based on an input dataset containing descriptions of relevant activities
cols_to_compare: list = columns of the input data to use to compare entities
entity_col: str = column containing entity names
topic_col: str = column containing topics by which to compare entities
groupby_cols: list = list of columns to group data by, so that the text in each group is summarised independently
query_cols: list = columns related to strengths and weaknesses, e.g. description of strengths, description of weaknesses, etc.
output_cols: list = output column names
context_cols: list = columns to keep in the output data to track the source of summarised text (e.g. pagenum, doc_name)
max_context_len: int = maximum length of input text to be summarised together: input text will be broken into chunks of max_context_len
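A sketch showing how the column-role parameters of compare_entities might map onto an input dataset; the column names and sample row are illustrative assumptions.

```python
# One sample input row (assumed shape): each parameter below names a column in rows like this.
rows = [
    {"company": "Acme", "topic": "renewables",
     "strengths": "Large solar portfolio", "weaknesses": "No storage capacity",
     "pagenum": 4, "doc_name": "acme_esg.pdf"},
]

args = {
    "cols_to_compare": ["strengths", "weaknesses"],
    "entity_col": "company",
    "topic_col": "topic",
    "groupby_cols": ["topic"],                # each group is summarised independently
    "query_cols": ["strengths", "weaknesses"],
    "output_cols": ["comparison"],
    "context_cols": ["pagenum", "doc_name"],  # kept to trace the source of summarised text
    "max_context_len": 4000,                  # longer inputs are broken into chunks of this size
}
print(args)
```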
compare_entities_by_themes#
Compare similarities and differences between entities (e.g. companies) by topics, based on an input dataset containing descriptions of relevant activities
cols_to_compare: list = columns of the input data to use to compare entities
entity_col: str = column containing entity names
topic_col: str = column containing topics by which to compare entities
groupby_cols: list = list of columns to group data by, so that the text in each group is summarised independently
query_cols: list = columns related to strengths and weaknesses, e.g. description of strengths, description of weaknesses, etc.
output_cols: list = output column names
context_cols: list = columns to keep in the output data to track the source of summarised text (e.g. pagenum, doc_name)
max_context_len: int = maximum length of input text to be summarised together: input text will be broken into chunks of max_context_len
convert_pdf_to_markdown#
Convert PDF file to markdown
convert_py_to_ipynb#
Convert python scripts to ipynb notebooks
git_username: str = git username
repo_name: str = git repo name
access_key: str = access key for git account
target_folder: str = target folder inside the code repo containing the files to be converted (optional); if not specified, all .py files in the repo will be converted
branch_name: str = branch to use for converting py files to ipynb files (optional; main by default)
correct_data_errors#
Correct errors in the extracted data. Given source and derived columns, the function aims to find values in the derived columns that are inconsistent with information in the source columns.
source_cols: list = list of column names to use as source data or ground truth
derived_cols: list = list of columns derived from source data
create_dataset#
create_dataset_from_tables#
Create a dataset from tables extracted from a document
doc_name: str = document name
attrs: list = attributes/columns to have in the dataset
create_dataset_from_text#
Create a dataset from text passages extracted from a document
doc_name: str = document name
attrs: list = attributes/columns to have in the dataset
create_query_variants#
Create multiple variants of a data query based on meta-data
metadata: jsonList = a json list containing meta-data
query: str = query to run on the data
n_variants: int = number of query variants to generate
create_text_embeddings#
Create text embeddings
docs: list = text passages to embed
model: str = model to use (ada, transformers, etc)
chunk_size: int = number of docs to keep in one chunk when generating embeddings
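A minimal create_text_embeddings argument sketch; the passages and chunk size are arbitrary examples.

```python
args = {
    "docs": [
        "Scope 1 emissions fell by 8% in 2023.",
        "The board approved a new climate transition plan.",
    ],
    "model": "ada",     # or transformers, etc.
    "chunk_size": 500,  # docs are embedded in batches of this size
}
print(args)
```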
create_text_graph#
Create knowledge graph from text data
cols_to_use: list = columns to use in the data to create graph
groupby_cols: list = list of columns to group data by, so that the text in each group is summarised independently
output_cols: list = output column names
context_cols: list = columns to keep in the output data to track the source of summarised text (e.g. pagenum, doc_name)
max_context_len: int = maximum length of input text to be summarised together: input text will be broken into chunks of max_context_len
download_document_urls#
Download document URLs for given entity names and document keyphrase
entity_names: list = list of entity/organisation names
doc_keywords: list = list of keywords to download documents by
download_documents#
Download documents from the homepage of organisations
entity_names: list = list of organisations to find homepage for
download_matching_files#
Download files matching a file extension from a list of urls
urls: list = list of urls from which to download matching files
embed_doc_data#
Embed document data
doc_name: str = document name
file_pattern: str = file pattern to select input files
cols_to_use: list = columns to use for creating text embeddings
model: str = model to use (ada, transformers, etc) (optional)
chunk_size: int = number of rows to keep in one chunk when generating embeddings
estimate_values#
Estimate values of given attributes based on a dataset
text_data: jsonList = input data from which to estimate values
cols_to_use: list = columns to use in input data for estimating values
metrics_to_estimate: list = metrics to estimate
groupby_cols: list = columns to group input data by, so that values will be estimated separately for each group
context_cols: list = columns to keep in the output data to track the source (e.g. pagenum, doc_name)
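A sketch of estimate_values arguments over a hypothetical extracted-text dataset; the column names and the metric are assumptions.

```python
args = {
    "text_data": [
        {"company": "Acme", "text": "Total GHG emissions were 120 ktCO2e in 2023.",
         "pagenum": 12, "doc_name": "acme_esg.pdf"},
    ],
    "cols_to_use": ["text"],                  # columns to read when estimating values
    "metrics_to_estimate": ["ghg_emissions"],
    "groupby_cols": ["company"],              # values are estimated separately per group
    "context_cols": ["pagenum", "doc_name"],  # kept to trace where each estimate came from
}
print(args)
```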
extract_attributes#
Extract structured attribute-value pairs from unstructured text data
extract_doc_author#
Extract the author of the document
doc_name: str = name of the document
doc_ext: str = document extension
questions: list = questions to ask to identify document author
max_pages: int = maximum number of pages to consider in the document to identify document author
extract_doc_info#
Extract document info
doc_name: str = name of the document
doc_type_choices: str = document types to classify documents into
extract_doc_org#
Extract organisation name that published the document
doc_name: str = name of the document
doc_ext: str = document extension (optional)
questions: list = questions to ask to identify document organisation
max_pages: int = maximum number of pages to consider in the document to identify document organisation
extract_doc_type#
Classify document type into a pre-defined list of categories
doc_name: str = document name
doc_ext: str = document extension
questions: list = questions to ask to identify document type
doc_type_choices: list = list of document types to choose from
max_pages: int = maximum number of pages to consider in the document to identify document type
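An illustrative extract_doc_type argument set; the document name, question, and category list are made up.

```python
args = {
    "doc_name": "acme_esg_2023",
    "doc_ext": "pdf",
    "questions": ["What type of report is this document?"],
    "doc_type_choices": ["annual report", "sustainability report", "policy document"],
    "max_pages": 5,  # only the first few pages are typically needed to classify a document
}
print(args)
```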
extract_doc_year#
Extract the document publication year
doc_name: str = document name
doc_ext: str = document extension (optional)
extract_num_pages#
Extract number of pages from a document
doc_name: str = document name
doc_ext: str = document extension
extract_orig_tables#
Extract tables in their original form from the document
doc_name: str = document name
doc_ext: str = document extension
extract_quant_metrics#
Extract structured quant metrics from unstructured text data
extract_text#
Extract text from images
extract_text_pipeline#
Run text extraction pipeline all the way from OCR to text segmentation and table reconstruction
doc_name: str = document name
doc_ext: str = document extension (optional)
extract_text_years#
extractive_qa#
Extract answers to questions
context: str = context to use to answer the question
question: str = question
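A minimal extractive_qa example; the context and question are made up, and the answer is expected to be a span taken from the context.

```python
args = {
    "context": "Acme Corp was founded in 1998 and is headquartered in Oslo.",
    "question": "Where is Acme Corp headquartered?",
}
# Expected behaviour: the answer ("Oslo") is extracted verbatim from the context.
print(args)
```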
filter_columns#
Filter columns by relevance to a query
metadata: jsonList = meta-data of the table containing column names and their descriptions
query: str = query to be run on the data
max_context_len: int = maximum context length for LLM (optional)
calc_mode: str = how to perform calculations (optional)
sync: synchronous mode
async: asynchronous mode (default)
parallel: parallel mode
filter_data#
Filter data
data: jsonList = input data
query: str = filtering query
method: str = filtering method (optional)
query-relevance: filter data by relevance to the query
one-step: filter data based on all the conditions specified in the query in one step
multi-step: filter data based on each condition in the query, one step at a time (default)
calc_mode: str = how to perform calculations (optional)
sync: synchronous mode
async: asynchronous mode (default)
parallel: parallel mode
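A filter_data argument sketch that makes the method options concrete; the sample rows and query are illustrative.

```python
args = {
    "data": [
        {"company": "Acme", "year": 2023, "emissions": 110.0},
        {"company": "Beta", "year": 2022, "emissions": 95.0},
    ],
    "query": "rows for 2023 with emissions above 100",
    "method": "multi-step",  # or "one-step", "query-relevance"; multi-step is the default
    "calc_mode": "async",    # sync, async (default), or parallel
}
print(args)
```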
filter_similarity_scored_data#
find_homepage#
Find homepage for a list of organisations
entity_names: list = list of organisation names
format_recipe_tasks#
Format tasks in a recipe
recipe: str = recipe of tasks
line_break: str = line break separating various tasks (optional)
generate_api_key#
Generate api key
username: str = username
password: str = password
email: str = email
key_name: str = name for the key
generate_metadata#
Generate meta-data for tabular data
data: jsonList = input data
data_context: str = a short description of the table (optional)
cols_to_use: list = columns for which to generate meta-data (optional)
model: str = model to use for generating meta-data (optional)
calc_mode: str = how to perform calculations (optional)
sync: synchronous mode
async: asynchronous mode (default)
parallel: parallel mode
generate_signed_url#
Generate a pre-signed URL for a file
file: str = file for which to generate URL
time_ext: int = expiration time for the generated URL (optional)
generate_training_data#
Generate training data in desired format
doc_name: str = doc_name from which to generate training data
data_format: str = format of training data (masked-original-tables, masked-structured-data, data-structuring, recall-original-data, generative-qa)
masked_token: str = token to use for masked data (optional)
get_html_tables#
Extract HTML tables from a url
get_last_modified_time#
Get last modified time for a file
file: str = file path for which to get last modified time
get_usage_summary#
Calculate usage summary
username: str = username
start_time: int = start time for usage
end_time: int = end time for usage
route: str = usage route ('api', 'app', or 'all') (optional; 'all' by default)
group_topics#
Group topics and return a dataframe with parent and child topics
topics: list = list of input topics, which need to be grouped together
infer_from_text#
Make inferences on text (deprecated, use multiple_choice_qa instead)
text: str = text on which to make inferences
question: str = question
launch_cluster#
Launch a compute cluster on the cloud
container_name: str = container to run on the launched cluster
cluster_name: str = name of the cluster to launch (optional; name is randomly generated by default)
cloud: str = cloud where the cluster is to be launched (aws, gcp, azure, etc.) (optional; aws by default)
accelerators: str = type of gpu needed on the cluster (optional; None by default)
use_spot: bool = whether to use spot instances or not (optional; True by default)
port: int = port to deploy the container on the running cluster (optional; 8080 by default)
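A launch_cluster argument sketch; the container image, cluster name, and accelerator string are illustrative assumptions (the exact accelerator format is not specified above).

```python
args = {
    "container_name": "my-registry/worker:latest",  # container to run on the launched cluster
    "cluster_name": "quant-extraction-1",           # optional; a random name is used by default
    "cloud": "aws",                                 # optional; aws by default (also gcp, azure, etc.)
    "accelerators": "A100",                         # optional GPU type; None by default
    "use_spot": True,                               # optional; spot instances by default
    "port": 8080,                                   # optional; 8080 by default
}
print(args)
```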
launch_jupyter_notebook#
Launch jupyter_genie notebook
cluster_name: str = name of the cluster to run commands on
list_doc_files#
List document files matching a given file pattern
doc_name: str = document name
file_pattern: str = file pattern to list matching files
merge_doc_files#
merge_doc_info_with_quants#
Merge document info with quants extracted from documents
doc_name: str = document name
quants_file_pattern: str = file pattern to read quants
model_inference#
Make inferences from a pre-trained model
model_name: str = name of the model to use for making inferences
data_file: str = file path for the data on which to make inferences
output_file: str = file path for inference output
multiple_choice_qa#
Multiple-choice QA
context: str = context to use to answer the question
question: str = question
choices: list = possible choices for the answer
multi_class: bool = whether multiple choices are possible or not
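A small multiple_choice_qa example; the context, question, and choices are made up.

```python
args = {
    "context": "The report covers fiscal years 2021 through 2023.",
    "question": "Which fiscal years does the report cover?",
    "choices": ["2019", "2020", "2021", "2022", "2023"],
    "multi_class": True,  # more than one choice may apply
}
print(args)
```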
open_email_account#
Open an email account
first_name: str = first name
last_name: str = last name (optional)
email_provider: str = email provider (optional; tutanota by default)
parse_numeric_string#
plot_data#
Generate plots from input data
data: jsonList = input data to make plots from
cat_col: str = category column
val_col: str = value column
groupby: list = columns to group data by when making plots
plot_types: list = type of plots to make
rank_answers_to_query#
Rank multiple answers to a given query
data: jsonList = a json list containing input data
query: str = query to answer on the data
answers: list = list of candidate answers to rank
rank_attributes_by_relevance#
read_quants#
read_synthesized_data#
read_taxonomy#
Read taxonomy
reconstruct_orig_tables#
Reconstruct original tables from table cells
request_enterprise_subscription#
Request an enterprise subscription
use_case: str = description of use-case
budget: str = description of budget
org_name: str = name of organisation
email: str = email to get back to
phone: str = phone number (optional)
reset_password#
Reset password
email: str = email associated with the password
password: str = new password
username: str = user name
score_doc_text_similarity#
Score text similarity for document data
doc_name: str = document name
file_pattern: str = file pattern to select input files
cols_to_use: list = columns to use for creating text embeddings
model: str = model to use (ada, transformers, etc) (optional)
chunk_size: int = number of rows to keep in one chunk when generating embeddings (optional)
frac_rows_to_keep: float = fraction of rows to keep ranked by similarity
score_relevance#
Score relevance of text segments to a given topic: useful for identifying the text passages most relevant to that topic
topic: str = topic with which the relevance is to be determined
texts: list = list of text passages that are to be ranked by relevance to topic
score_text_similarity#
Score semantic text similarity for given set of documents and queries
docs: list = text passages to score similarity for
queries: list = queries with which to compute similarity of docs
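A score_text_similarity sketch; the passages and query are illustrative, and the output shape described in the comment is an assumption.

```python
args = {
    "docs": [
        "Scope 1 emissions fell by 8% in 2023.",
        "The CEO joined the board in 2015.",
    ],
    "queries": ["greenhouse gas emissions"],
}
# The result can be read as a docs x queries grid of similarity scores,
# i.e. one score per (doc, query) pair.
print(args)
```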
search_web#
Search web for a given list of keyphrases
keyphrases: list = list of keyphrases to search the web for
segment_excel_cells#
Segment excel cells (deprecated)
segment_text#
Segment text extracted via OCR
doc_name: str = document name
file_pattern: str = file pattern to select input files
send_email#
Send email
standardise_data#
Standardise data: looks at all columns of the data together to homogenise existing names, and can also add new columns if needed
text_data: jsonList = input data
cols_to_std: list = columns to standardise
groupby_cols: list = columns to group input data by, so that a separate dataset will be created for each group
context_cols: list = columns to keep in the output data to track the source (e.g. pagenum, doc_name)
standardise_names#
Standardise names
text_data: jsonList = input data
text_col: str = text column containing names to standardise
groupby_cols: list = columns to group input data by, so that values will be estimated separately for each group
context_cols: list = columns to keep in the output data to track the source (e.g. pagenum, doc_name)
standardise_units#
Standardise units
text_data: jsonList = input data in which to standardise units
text_col: str = column containing unit info
groupby_cols: list = columns to group input data by, so that values will be estimated separately for each group
context_cols: list = columns to keep in the output data to track the source (e.g. pagenum, doc_name)
start_cluster#
Start a stopped cluster
username: str = user name
cluster_name: str = name of the cluster to run commands on
structure_passage_quants#
Extract structured quantitative information from text passages of a document
doc_name: str = document name
overwrite: int = whether to overwrite existing output or not
structure_quants_pipeline#
Run quant-structuring pipeline all the way from text extraction from documents, to text translation, quant extraction and structuring
doc_name: str = document name
doc_ext: str = document extension
Additional task parameters (to be passed in task json)
cluster_name: str = name of the cluster to run task on
stop_cluster_after_job: int = whether to stop cluster after the job is done
delete_cluster_after_job: int = whether to delete cluster after the job is done
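A sketch of a structure_quants_pipeline task json including the additional task parameters listed above; the task wrapper keys and where the cluster parameters sit in the payload are assumptions.

```python
task = {
    "func": "structure_quants_pipeline",  # hypothetical task wrapper
    "args": {
        "doc_name": "acme_esg_2023",
        "doc_ext": "pdf",
    },
    # Additional task parameters (assumed to sit alongside the function args in the task json):
    "cluster_name": "quant-extraction-1",
    "stop_cluster_after_job": 1,
    "delete_cluster_after_job": 0,
}
print(task)
```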
structure_query#
Structure query by relevance to meta-data
metadata: jsonList = meta-data of the table containing column names and their descriptions
query: str = query to be run on the data
max_context_len: int = maximum context length for LLM (optional)
calc_mode: str = how to perform calculations (optional)
sync: synchronous mode
async: asynchronous mode (default)
parallel: parallel mode
structure_tabular_quants#
Extract structured quantitative information from tables of a document
doc_name: str = document name
overwrite: int = whether to overwrite existing output or not
summarise_table#
summarise_text#
synthesize_qual_data#
Synthesize qualitative data extracted from documents: this will combine document level info with text extracted from document pages, and add text embeddings
doc_name: str = document name
synthesize_quant_data#
Synthesize quantitative data: this will synthesize document level info with quantitative data extracted from the document, and add text embeddings
doc_name: str = document name
test_api#
Test if the API is working
train_llm#
Train an LLM
username: str = user name
model_name: str = model name to use for saving the model
doc_names: list = list of documents to use for training data
model: str = base model to use for training
training_formats: list = formats of training data to use
train_multimodal_model#
trans_report_segments#
Translate extracted text segments from documents
doc_name: str = document name
overwrite: int = whether to overwrite existing output or not
translate_text_pipeline#
Run text translation pipeline all the way from text extraction from documents to text segmentation and translation
doc_name: str = document name
doc_ext: str = document extension
upload_data#
Upload data
contents: list = contents to be uploaded
filenames: list = filenames to use for uploaded contents
username: str = name of the user uploading content
verify_company_quants#
Verify extracted quants from company disclosures
doc_name: str = document name
file_pattern: str = file pattern to select input files
verify_data#
verify_quants_company_info#
write_and_execute_code#
Write and execute code to perform a task
prompt: str = task prompt
system_message: str = system prompt
conversation_history: list = history of past messages (optional)
model: str = model name to use (optional)
api_key: str = api key for the model (optional)
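A write_and_execute_code argument sketch; the prompt, system message, and model name are illustrative assumptions.

```python
args = {
    "prompt": "Read data.csv and plot monthly totals as a bar chart.",
    "system_message": "You are a careful Python data analyst.",
    "conversation_history": [],         # optional prior messages
    "model": "gpt-4",                   # optional; model name is an assumption
    "api_key": "<model-provider-key>",  # optional key for the chosen model
}
print(args)
```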
write_doc_info_to_sql_table#
write_pdf_img#
Save images for all pdf pages in a document
doc_name: str = document name
dpi: int = dots per inch
fmt: str = image format
overwrite: int = whether to overwrite the existing output or not
write_to_file#
Write to file
content = content to write to file
file: str = file path to write content to
write_to_sql_table#
Write data to sql table
data: jsonList = data that needs to be written to sql table