API Documentation#
add_embeddings_to_data#
Add embeddings to document files
data: jsonList = data for which to create embeddings
cols_to_use: list = columns to use for creating text embeddings
model: str = model to use (ada, transformers, etc) (optional)
chunk_size: int = number of rows to keep in one chunk when generating embeddings
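A minimal sketch of how a call to add_embeddings_to_data might be assembled and submitted. The endpoint URL, task wrapper, and header name are illustrative assumptions, not part of the documented API; only the argument names come from the parameter list above.

```python
import requests

# Input rows as a jsonList: a list of dicts, one dict per row (assumed shape).
data = [
    {"doc_name": "report_2022.pdf", "pagenum": 1, "text": "Revenue grew 12% year on year."},
    {"doc_name": "report_2022.pdf", "pagenum": 2, "text": "Emissions fell by 8%."},
]

payload = {
    "func": "add_embeddings_to_data",  # hypothetical task wrapper
    "args": {
        "data": data,
        "cols_to_use": ["text"],       # columns used to build the text that gets embedded
        "model": "ada",                # optional; ada, transformers, etc.
        "chunk_size": 100,             # rows per chunk when generating embeddings
    },
}

# Hypothetical endpoint and auth header; substitute the values for your deployment.
resp = requests.post(
    "https://api.example.com/task",
    json=payload,
    headers={"Authorization": "Bearer <api_key>"},
)
print(resp.json())
```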
aggregate_data#
Aggregate data to facilitate answering a given query
data: jsonList = input data
data_context: str = a short description of the table (optional)
cols_to_use: list = columns for which to generate meta-data (optional)
max_context_len: int = maximum context length for LLM (optional)
calc_mode: str = how to perform calculations (optional)
sync: synchronous mode
async: asynchronous mode (default)
parallel: parallel mode
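A sketch of an aggregate_data argument set, shown only to make the calc_mode options concrete; the sample rows and the surrounding call mechanics (see the add_embeddings_to_data example above) are assumptions.

```python
# jsonList input: one dict per row (assumed shape).
args = {
    "data": [
        {"company": "Acme", "year": 2022, "emissions": 120.5},
        {"company": "Acme", "year": 2023, "emissions": 110.0},
    ],
    "data_context": "company emissions by year",      # optional short table description
    "cols_to_use": ["company", "year", "emissions"],  # optional; columns for which to generate meta-data
    "max_context_len": 4000,                          # optional LLM context budget
    "calc_mode": "async",                             # sync, async (default), or parallel
}
print(args)
```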
classify_text#
Classify text
text: str = input text to classify
labels: list = a set of labels to classify text into
multi_class: bool = whether multiple labels are allowed or not
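A small illustrative argument set for classify_text; the sample text and labels are made up.

```python
args = {
    "text": "The company reported a 30% reduction in water usage.",
    "labels": ["environment", "social", "governance"],
    "multi_class": False,  # set True to allow more than one label per text
}
print(args)
```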
command_cluster#
Execute commands on cluster
cluster_name: str = name of the cluster to run commands on
commands: list = list of commands to run on cluster
compare_entities#
Compare strengths and weaknesses of entities (e.g. companies) by topics, based on an input dataset containing descriptions of relevant activities
cols_to_compare: list = columns of the input data to use to compare entities
entity_col: str = column containing entity names
topic_col: str = column containing topics by which to compare entities
groupby_cols: list = list of columns to group data by, so that the text in each group is summarised independently
query_cols: list = columns related to strengths and weaknesses, e.g. description of strengths, description of weaknesses, etc.
output_cols: list = output column names
context_cols: list = columns to keep in the output data to track the source of summarised text (e.g. pagenum, doc_name)
max_context_len: int = maximum length of input text to be summarised together: input text will be broken into chunks of max_context_len
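A sketch showing how the column-role parameters of compare_entities might map onto an input dataset; the column names and sample row are illustrative assumptions.

```python
# One sample input row (assumed shape): each parameter below names a column in rows like this.
rows = [
    {"company": "Acme", "topic": "renewables",
     "strengths": "Large solar portfolio", "weaknesses": "No storage capacity",
     "pagenum": 4, "doc_name": "acme_esg.pdf"},
]

args = {
    "cols_to_compare": ["strengths", "weaknesses"],
    "entity_col": "company",
    "topic_col": "topic",
    "groupby_cols": ["topic"],                # each group is summarised independently
    "query_cols": ["strengths", "weaknesses"],
    "output_cols": ["comparison"],
    "context_cols": ["pagenum", "doc_name"],  # kept to trace the source of summarised text
    "max_context_len": 4000,                  # longer inputs are broken into chunks of this size
}
print(args)
```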
compare_entities_by_themes#
Compare similarities and differences between entities (e.g. companies) by topics, based on an input dataset containing descriptions of relevant activities
cols_to_compare: list = columns of the input data to use to compare entities
entity_col: str = column containing entity names
topic_col: str = column containing topics by which to compare entities
groupby_cols: list = list of columns to group data by, so that the text in each group is summarised independently
query_cols: list = columns related to strengths and weaknesses, e.g. description of strengths, description of weaknesses, etc.
output_cols: list = output column names
context_cols: list = columns to keep in the output data to track the source of summarised text (e.g. pagenum, doc_name)
max_context_len: int = maximum length of input text to be summarised together: input text will be broken into chunks of max_context_len
convert_pdf_to_markdown#
Convert PDF file to markdown
convert_py_to_ipynb#
Convert python scripts to ipynb notebooks
git_username: str = git username
repo_name: str = git repo name
access_key: str = access key for git account
target_folder: str = target folder inside the code repo containing the files to be converted (optional); if not specified, all .py files in the repo will be converted
branch_name: str = branch to use for converting py files to ipynb files (optional; main by default)
correct_data_errors#
Correct errors in the extracted data. Given source and derived columns, the function aims to find values in the derived columns that are inconsistent with information in the source columns.
source_cols: list = list of column names to use as source data or ground truth
derived_cols: list = list of columns derived from source data
create_dataset#
create_dataset_from_tables#
Create a dataset from tables extracted from a document
doc_name: str = document name
attrs: list = attributes/columns to have in the dataset
create_dataset_from_text#
Create a dataset from text passages extracted from a document
doc_name: str = document name
attrs: list = attributes/columns to have in the dataset
create_query_variants#
Create multiple variants of a data query based on meta-data
metadata: jsonList = a json list containing meta-data
query: str = query to run on the data
n_variants: int = number of query variants to generate
create_text_embeddings#
Create text embeddings
docs: list = text passages to embed
model: str = model to use (ada, transformers, etc)
chunk_size: int = number of docs to keep in one chunk when generating embeddings
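A minimal create_text_embeddings argument sketch; the passages and chunk size are arbitrary examples.

```python
args = {
    "docs": [
        "Scope 1 emissions fell by 8% in 2023.",
        "The board approved a new climate transition plan.",
    ],
    "model": "ada",     # or transformers, etc.
    "chunk_size": 500,  # docs are embedded in batches of this size
}
print(args)
```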
create_text_graph#
Create knowledge graph from text data
cols_to_use: list = columns to use in the data to create graph
groupby_cols: list = list of columns to group data by, so that the text in each group is summarised independently
output_cols: list = output column names
context_cols: list = columns to keep in the output data to track the source of summarised text (e.g. pagenum, doc_name)
max_context_len: int = maximum length of input text to be summarised together: input text will be broken into chunks of max_context_len
download_document_urls#
Download document URLs for given entity names and document keyphrase
entity_names: list = list of entity/organisation names
doc_keywords: list = list of keywords to download documents by
download_documents#
Download documents from the homepage of organisations
entity_names: list = list of organisations to find homepage for
download_matching_files#
Download files matching a file extension from a list of urls
urls: list = list of urls from which to download matching files
embed_doc_data#
Embed document data
doc_name: str = document name
file_pattern: str = file pattern to select input files
cols_to_use: list = columns to use for creating text embeddings
model: str = model to use (ada, transformers, etc) (optional)
chunk_size: int = number of rows to keep in one chunk when generating embeddings
estimate_values#
Estimate values of given attributes based on a dataset
text_data: jsonList = input data from which to estimate values
cols_to_use: list = columns to use in input data for estimating values
metrics_to_estimate: list = metrics to estimate
groupby_cols: list = columns to group input data by, so that values will be estimated separately for each group
context_cols: list = columns to keep in the output data to track the source (e.g. pagenum, doc_name)
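A sketch of estimate_values arguments over a hypothetical extracted-text dataset; the column names and the metric are assumptions.

```python
args = {
    "text_data": [
        {"company": "Acme", "text": "Total GHG emissions were 120 ktCO2e in 2023.",
         "pagenum": 12, "doc_name": "acme_esg.pdf"},
    ],
    "cols_to_use": ["text"],                  # columns to read when estimating values
    "metrics_to_estimate": ["ghg_emissions"],
    "groupby_cols": ["company"],              # values are estimated separately per group
    "context_cols": ["pagenum", "doc_name"],  # kept to trace where each estimate came from
}
print(args)
```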
extract_attributes#
Extract structured attribute-value pairs from unstructured text data
extract_doc_author#
Extract the author of the document
doc_name: str = name of the document
doc_ext: str = document extension
questions: list = questions to ask to identify document author
max_pages: int = maximum number of pages to consider in the document to identify document author
extract_doc_info#
Extract document info
doc_name: str = name of the document
doc_type_choices: str = document types to classify documents into
extract_doc_org#
Extract organisation name that published the document
doc_name: str = name of the document
doc_ext: str = document extension (optional)
questions: list = questions to ask to identify document organisation
max_pages: int = maximum number of pages to consider in the document to identify document organisation
extract_doc_type#
Classify document type into a pre-defined list of categories
doc_name: str = document name
doc_ext: str = document extension
questions: list = questions to ask to identify document type
doc_type_choices: list = list of document types to choose from
max_pages: int = maximum number of pages to consider in the document to identify document type
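An illustrative extract_doc_type argument set; the document name, question, and category list are made up.

```python
args = {
    "doc_name": "acme_esg_2023",
    "doc_ext": "pdf",
    "questions": ["What type of report is this document?"],
    "doc_type_choices": ["annual report", "sustainability report", "policy document"],
    "max_pages": 5,  # only the first few pages are typically needed to classify a document
}
print(args)
```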
extract_doc_year#
Extract the document publication year
doc_name: str = document name
doc_ext: str = document extension (optional)
extract_num_pages#
Extract number of pages from a document
doc_name: str = document name
doc_ext: str = document extension
extract_orig_tables#
Extract tables in their original form from the document
doc_name: str = document name
doc_ext: str = document extension
extract_quant_metrics#
Extract structured quant metrics from unstructured text data
extract_text#
Extract text from images
extract_text_pipeline#
Run text extraction pipeline all the way from OCR to text segmentation and table reconstruction
doc_name: str = document name
doc_ext: str = document extension (optional)
extract_text_years#
extractive_qa#
Extract answers to questions
context: str = context to use to answer the question
question: str = question
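A minimal extractive_qa example; the context and question are made up, and the answer is expected to be a span taken from the context.

```python
args = {
    "context": "Acme Corp was founded in 1998 and is headquartered in Oslo.",
    "question": "Where is Acme Corp headquartered?",
}
# Expected behaviour: the answer ("Oslo") is extracted verbatim from the context.
print(args)
```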
filter_columns#
Filter columns by relevance to a query
metadata: jsonList = meta-data of the table containing column names and their descriptions
query: str = query to be run on the data
max_context_len: int = maximum context length for LLM (optional)
calc_mode: str = how to perform calculations (optional)
sync: synchronous mode
async: asynchronous mode (default)
parallel: parallel mode
filter_data#
Filter data
data: jsonList = input data
query: str = filtering query
method: str = filtering method (optional)
query-relevance: filter data by relevance to the query
one-step: filter data based on all the conditions specified in the query in one step
multi-step: filter data based on each condition in the query, one step at a time (default)
calc_mode: str = how to perform calculations (optional)
sync: synchronous mode
async: asynchronous mode (default)
parallel: parallel mode
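A filter_data argument sketch that makes the method options concrete; the sample rows and query are illustrative.

```python
args = {
    "data": [
        {"company": "Acme", "year": 2023, "emissions": 110.0},
        {"company": "Beta", "year": 2022, "emissions": 95.0},
    ],
    "query": "rows for 2023 with emissions above 100",
    "method": "multi-step",  # or "one-step", "query-relevance"; multi-step is the default
    "calc_mode": "async",    # sync, async (default), or parallel
}
print(args)
```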
filter_similarity_scored_data#
find_homepage#
Find homepage for a list of organisations
entity_names: list = list of organisation names
format_recipe_tasks#
Format tasks in a recipe
recipe: str = recipe of tasks
line_break: str = line break separating various tasks (optional)
generate_api_key#
Generate api key
username: str = username
password: str = password
email: str = email
key_name: str = name for the key
generate_metadata#
Generate meta-data for tabular data
data: jsonList = input data
data_context: str = a short description of the table (optional)
cols_to_use: list = columns for which to generate meta-data (optional)
model: str = model to use for generating meta-data (optional)
calc_mode: str = how to perform calculations (optional)
sync: synchronous mode
async: asynchronous mode (default)
parallel: parallel mode
generate_signed_url#
Generate a pre-signed URL for a file
file: str = file for which to generate URL
time_ext: int = expiration time for the generated URL (optional)
generate_training_data#
Generate training data in desired format
doc_name: str = doc_name from which to generate training data
data_format: str = format of training data (masked-original-tables, masked-structured-data, data-structuring, recall-original-data, generative-qa)
masked_token: str = token to use for masked data (optional)
get_html_tables#
Extract HTML tables from a url
get_last_modified_time#
Get last modified time for a file
file: str = file path for which to get last modified time
get_usage_summary#
Calculate usage summary
username: str = username
start_time: int = start time for usage
end_time: int = end time for usage
route: str = usage route ('api', 'app', or 'all') (optional; 'all' by default)
group_topics#
Group topics and return a dataframe with parent and child topics
topics: list = list of input topics, which need to be grouped together
infer_from_text#
Make inferences on text (deprecated, use multiple_choice_qa instead)
text: str = text on which to make inferences
question: str = question
launch_cluster#
Launch a compute cluster on the cloud
container_name: str = container to run on the launched cluster
cluster_name: str = name of the cluster to launch (optional; name is randomly generated by default)
cloud: str = cloud where the cluster is to be launched (aws, gcp, azure, etc.) (optional; aws by default)
accelerators: str = type of gpu needed on the cluster (optional; None by default)
use_spot: bool = whether to use spot instances or not (optional; True by default)
port: int = port to deploy the container on the running cluster (optional; 8080 by default)
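A launch_cluster argument sketch; the container image, cluster name, and accelerator string are illustrative assumptions (the exact accelerator format is not specified above).

```python
args = {
    "container_name": "my-registry/worker:latest",  # container to run on the launched cluster
    "cluster_name": "quant-extraction-1",           # optional; a random name is used by default
    "cloud": "aws",                                 # optional; aws by default (also gcp, azure, etc.)
    "accelerators": "A100",                         # optional GPU type; None by default
    "use_spot": True,                               # optional; spot instances by default
    "port": 8080,                                   # optional; 8080 by default
}
print(args)
```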
launch_jupyter_notebook#
Launch jupyter_genie notebook
cluster_name: str = name of the cluster to run commands on
list_doc_files#
List document files matching a given file pattern
doc_name: str = document name
file_pattern: str = file pattern to list matching files
merge_doc_files#
merge_doc_info_with_quants#
Merge document info with quants extracted from documents
doc_name: str = document name
quants_file_pattern: str = file pattern to read quants
model_inference#
Make inferences from a pre-trained model
model_name: str = name of the model to use for making inferences
data_file: str = file path for the data on which to make inferences
output_file: str = file path for inference output
multiple_choice_qa#
Multiple-choice QA
context: str = context to use to answer the question
question: str = question
choices: list = possible choices for the answer
multi_class: bool = whether multiple choices are possible or not
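A small multiple_choice_qa example; the context, question, and choices are made up.

```python
args = {
    "context": "The report covers fiscal years 2021 through 2023.",
    "question": "Which fiscal years does the report cover?",
    "choices": ["2019", "2020", "2021", "2022", "2023"],
    "multi_class": True,  # more than one choice may apply
}
print(args)
```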
open_email_account#
Open an email account
first_name: str = first name
last_name: str = last name (optional)
email_provider: str = email provider (optional; tutanota by default)
parse_numeric_string#
plot_data#
Generate plots from input data
data: jsonList = input data to make plots from
cat_col: str = category column
val_col: str = value column
groupby: list = columns to group data by when making plots
plot_types: list = type of plots to make
rank_answers_to_query#
Rank multiple answers to a given query
data: jsonList = a json list containing input data
query: str = query to answer on the data
answers: list = list of candidate answers to rank
rank_attributes_by_relevance#
read_quants#
read_synthesized_data#
read_taxonomy#
Read taxonomy
reconstruct_orig_tables#
Reconstruct original tables from table cells
request_enterprise_subscription#
Request an enterprise subscription
use_case: str = description of use-case
budget: str = description of budget
org_name: str = name of organisation
email: str = email to get back to
phone: str = phone number (optional)
reset_password#
Reset password
email: str = email associated with the password
password: str = new password
username: str = user name
score_doc_text_similarity#
Score text similarity for document data
doc_name: str = document name
file_pattern: str = file pattern to select input files
cols_to_use: list = columns to use for creating text embeddings
model: str = model to use (ada, transformers, etc) (optional)
chunk_size: int = number of rows to keep in one chunk when generating embeddings (optional)
frac_rows_to_keep: float = fraction of rows to keep ranked by similarity
score_relevance#
Score relevance of text segments to a given topic: useful for identifying the text passages most relevant to that topic
topic: str = topic with which the relevance is to be determined
texts: list = list of text passages that are to be ranked by relevance to topic
score_text_similarity#
Score semantic text similarity for given set of documents and queries
docs: list = text passages to score similarity for
queries: list = queries with which to compute similarity of docs
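A score_text_similarity sketch; the passages and query are illustrative, and the output shape described in the comment is an assumption.

```python
args = {
    "docs": [
        "Scope 1 emissions fell by 8% in 2023.",
        "The CEO joined the board in 2015.",
    ],
    "queries": ["greenhouse gas emissions"],
}
# The result can be read as a docs x queries grid of similarity scores,
# i.e. one score per (doc, query) pair.
print(args)
```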
search_web#
Search web for a given list of keyphrases
keyphrases: list = list of keyphrases to search the web for
segment_excel_cells#
Segment excel cells (deprecated)
segment_text#
Segment text extracted via OCR
doc_name: str = document name
file_pattern: str = file pattern to select input files
send_email#
Send email
standardise_data#
Standardise data: looks at all columns of the data together to homogenise existing names, and can also add new columns if needed
text_data: jsonList = input data
cols_to_std: list = columns to standardise
groupby_cols: list = columns to group input data by, so that a separate dataset will be created for each group
context_cols: list = columns to keep in the output data to track the source (e.g. pagenum, doc_name)
standardise_names#
Standardise names
text_data: jsonList = input data
text_col: str = text column containing names to standardise
groupby_cols: list = columns to group input data by, so that values will be estimated separately for each group
context_cols: list = columns to keep in the output data to track the source (e.g. pagenum, doc_name)
standardise_units#
Standardise units
text_data: jsonList = input data in which to standardise units
text_col: str = column containing unit info
groupby_cols: list = columns to group input data by, so that values will be estimated separately for each group
context_cols: list = columns to keep in the output data to track the source (e.g. pagenum, doc_name)
start_cluster#
Start a stopped cluster
username: str = user name
cluster_name: str = name of the cluster to run commands on
structure_passage_quants#
Extract structured quantitative information from text passages of a document
doc_name: str = document name
overwrite: int = whether to overwrite existing output or not
structure_quants_pipeline#
Run quant-structuring pipeline all the way from text extraction from documents, to text translation, quant extraction and structuring
doc_name: str = document name
doc_ext: str = document extension
Additional task parameters (to be passed in task json)
cluster_name: str = name of the cluster to run task on
stop_cluster_after_job: int = whether to stop cluster after the job is done
delete_cluster_after_job: int = whether to delete cluster after the job is done
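A sketch of a structure_quants_pipeline task json including the additional task parameters listed above; the task wrapper keys and where the cluster parameters sit in the payload are assumptions.

```python
task = {
    "func": "structure_quants_pipeline",  # hypothetical task wrapper
    "args": {
        "doc_name": "acme_esg_2023",
        "doc_ext": "pdf",
    },
    # Additional task parameters (assumed to sit alongside the function args in the task json):
    "cluster_name": "quant-extraction-1",
    "stop_cluster_after_job": 1,
    "delete_cluster_after_job": 0,
}
print(task)
```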
structure_query#
Structure query by relevance to meta-data
metadata: jsonList = meta-data of the table containing column names and their descriptions
query: str = query to be run on the data
max_context_len: int = maximum context length for LLM (optional)
calc_mode: str = how to perform calculations (optional)
sync: synchronous mode
async: asynchronous mode (default)
parallel: parallel mode
structure_tabular_quants#
Extract structured quantitative information from tables of a document
doc_name: str = document name
overwrite: int = whether to overwrite existing output or not
summarise_table#
summarise_text#
synthesize_qual_data#
Synthesize qualitative data extracted from documents: this will combine document level info with text extracted from document pages, and add text embeddings
doc_name: str = document name
synthesize_quant_data#
Synthesize quantitative data: this will synthesize document level info with quantitative data extracted from the document, and add text embeddings
doc_name: str = document name
test_api#
Test if the API is working
train_llm#
Train an LLM
username: str = user name
model_name: str = model name to use for saving the model
doc_names: list = list of documents to use for training data
model: str = base model to use for training
training_formats: list = formats of training data to use
train_multimodal_model#
trans_report_segments#
Translate extracted text segments from documents
doc_name: str = document name
overwrite: int = whether to overwrite existing output or not
translate_text_pipeline#
Run text translation pipeline all the way from text extraction from documents to text segmentation and translation
doc_name: str = document name
doc_ext: str = document extension
upload_data#
Upload data
contents: list = contents to be uploaded
filenames: list = filenames to use for uploaded contents
username: str = name of the user uploading content
verify_company_quants#
Verify extracted quants from company disclosures
doc_name: str = document name
file_pattern: str = file pattern to select input files
verify_data#
verify_quants_company_info#
write_and_execute_code#
Write and execute code to perform a task
prompt: str = task prompt
system_message: str = system prompt
conversation_history: list = history of past messages (optional)
model: str = model name to use (optional)
api_key: str = api key for the model (optional)
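A write_and_execute_code argument sketch; the prompt, system message, and model name are illustrative assumptions.

```python
args = {
    "prompt": "Read data.csv and plot monthly totals as a bar chart.",
    "system_message": "You are a careful Python data analyst.",
    "conversation_history": [],         # optional prior messages
    "model": "gpt-4",                   # optional; model name is an assumption
    "api_key": "<model-provider-key>",  # optional key for the chosen model
}
print(args)
```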
write_doc_info_to_sql_table#
write_pdf_img#
Save images for all pdf pages in a document
doc_name: str = document name
dpi: int = dots per inch
fmt: str = image format
overwrite: int = whether to overwrite the existing output or not
write_to_file#
Write to file
content = content to write to file
file: str = file path to write content to
write_to_sql_table#
Write data to sql table
data: jsonList = data that needs to be written to sql table