|
|
import os |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from transformers import file_utils |
|
|
|
|
|
print(file_utils.default_cache_path) |
|
|
|
|
|
import sys, re |
|
|
import time |
|
|
import pandas as pd |
|
|
from tqdm import tqdm |
|
|
|
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed |
|
|
from collections import Counter |
|
|
|
|
|
from gliner import GLiNER, GLiNERConfig, data_processing |
|
|
|
|
|
|
|
|
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True' |
|
|
|
|
|
import torch |
|
|
torch.cuda.empty_cache() |
|
|
|
|
|
import logging |
|
|
|
|
|
import tiktoken |
|
|
from langchain.text_splitter import TokenTextSplitter |
|
|
|
|
|
|
|
|
|
|
|
import requests |
|
|
|
|
|
import re |
|
|
|
|
|
from common import strtobool, split_camel_case, chunk_tokens, update_nested_dict, cleanInputText, token_counter, encoding_getter, extract_words, all_words_in_list, row_to_dict_string, strip_quotes, rescale_exponential_to_linear, rescale_exponential_to_logarithmic |
|
|
|
|
|
from accelerate import Accelerator |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
accelerator = Accelerator() |
|
|
|
|
|
device = accelerator.device |
|
|
print("Using accelerator device = "+ str(device)) |
|
|
|
|
|
|
|
|
from transformers import AutoTokenizer, AutoModelForCausalLM |
|
|
from transformers import pipeline |
|
|
from transformers.pipelines.pt_utils import KeyDataset |
|
|
|
|
|
from virtuosoQueryRest import sparqlQuery |
|
|
from llmqueryNer import call_model, call_model_with_caching, process_list, setup_gptjrc, api_call_gptjrc, model_list_gptjrc |
|
|
|
|
|
import string |
|
|
import datasets |
|
|
|
|
|
import argparse |
|
|
import json |
|
|
import random |
|
|
import numpy as np |
|
|
|
|
|
|
|
|
from retrieverRAG_SF import RAG_retrieval_Base |
|
|
|
|
|
from joblib import Memory |
|
|
|
|
|
cachedir = 'cached' |
|
|
mem = Memory(cachedir, verbose=False) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
POSSIBLE_KGchoices_List = sorted(["AI", "AIO", "AEO", "BFO", "BIM", "BCGO", "CL", "CHIRO", "CHEBI", "DCM", "FMA", "GO", "GENO", |
|
|
"GeoSPARQL", "HL7", "DOID", "HP", "HP_O", "IDO", "IAO", "ICD10", "LOINC", "MESH", "MONDO", "NCIT", |
|
|
"NCBITAXON", "NCBITaxon_", "NIFCELL", "NIFSTD", "GML", "OBCS", "OCHV", "OHPI", "OPB", "TRANS", |
|
|
"PLOSTHES", "RADLEX", "RO", "STY", "SO", "SNOMED", "STATO", "SYMP", "FoodOn", "UBERON", "ORDO", "HOOM", |
|
|
"VO", "OGMS", "EuroSciVoc", "ARO", "REACTO", "go-lego", "go-lego-reacto", "PR", "PSIMOD", "pathway_like_go_cams"]) |
|
|
|
|
|
ONLY_Ontologies_OnBIOPORTAL = sorted(["AI", "AIO", "AEO", "BCGO", "BFO", "BIM", "CHEBI", "CHIRO", "CL", "DCM", "DOID", "FMA", "FOODON", "GENO", "GML", "GO", "GEOSPARQL", "HL7", "HP", "HP_O", "IAO", "ICD10", "IDO", "LOINC", "MESH", "MONDO", "NCBITAXON", "NCIT", "NIFCELL", "NIFSTD", "OBCS", "OCHV", "OHPI", "OPB", "PLOSTHES", "RADLEX", "OBOREL", "SNOMEDCT", "SO", "STATO", "STY", "SYMP", "PTRANS", "UBERON", "ORDO", "HOOM", "VO", "OGMS", "ARO", "PR", "PSIMOD"]) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def default_serializer(obj): |
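    """JSON serialization fallback: convert numpy floating scalars to plain Python floats; raise TypeError for anything else."""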
|
|
if isinstance(obj, np.floating): |
|
|
return float(obj) |
|
|
raise TypeError(f"Object of type {obj.__class__.__name__} is not JSON serializable") |
|
|
|
|
|
|
|
|
def is_json(myjson): |
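    """Return True if `myjson` can be serialized to JSON (numpy floats handled via default_serializer), False otherwise."""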
|
|
try: |
|
|
|
|
|
json.dumps(myjson, default=default_serializer) |
|
|
    except (TypeError, ValueError):
|
|
return False |
|
|
return True |
|
|
|
|
|
|
|
|
def get_filtered_entities(txt_ents): |
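    """Keep only the entity dicts that carry a positive score and a non-empty entity_group."""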
|
|
filtered_ent_list = [] |
|
|
|
|
|
for ent_dct in txt_ents: |
|
|
if ent_dct["score"] > 0.0: |
|
|
if ent_dct["entity_group"]: |
|
|
filtered_ent_list.append(ent_dct) |
|
|
|
|
|
return filtered_ent_list |
|
|
|
|
|
|
|
|
def process_row_Gliner(args, tokenizerGliner, modelGlinerBio, modelGliner, glinerlabels, row): |
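    """Annotate one dataframe row with GLiNER: tokenize the source text, split it into chunks that fit the model's
    max_len, run predict_entities on each chunk, shift the offsets back to the full text, and return
    (row index, entity list) using the HF-pipeline style keys word/entity_group/score/start/end."""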
|
|
context_to_annotate = row[args.source_column] |
|
|
tokens = tokenizerGliner.tokenize(context_to_annotate) |
|
|
|
|
|
entities = [] |
|
|
offset = 0 |
|
|
|
|
|
if "gliner_large_bio" in args.model_id: |
|
|
max_chunk_length = modelGlinerBio.config.max_len |
|
|
else: |
|
|
max_chunk_length = modelGliner.config.max_len |
|
|
|
|
|
for chunk in chunk_tokens(tokens, (max_chunk_length - 1)): |
|
|
chunk_text = tokenizerGliner.convert_tokens_to_string(chunk) |
|
|
if "gliner_large_bio" in args.model_id: |
|
|
chunk_entities = modelGlinerBio.predict_entities(chunk_text, glinerlabels, |
|
|
threshold=args.entities_filter_threshold) |
|
|
else: |
|
|
chunk_entities = modelGliner.predict_entities(chunk_text, glinerlabels, |
|
|
threshold=args.entities_filter_threshold) |
|
|
|
|
|
adjusted_entities = [] |
|
|
for entity in chunk_entities: |
|
|
adjusted_entity = { |
|
|
'text': entity['text'], |
|
|
'score': entity['score'], |
|
|
'start': entity['start'] + offset, |
|
|
'end': entity['end'] + offset, |
|
|
'label': entity['label'] |
|
|
} |
|
|
adjusted_entities.append(adjusted_entity) |
|
|
|
|
|
entities.extend(adjusted_entities) |
|
|
offset += len(chunk_text) |
|
|
|
|
|
if entities and isinstance(entities, list): |
|
|
for d in entities: |
|
|
d['entity_group'] = d.pop('label') |
|
|
d['word'] = d.pop('text') |
|
|
d['entity_group'] = d['entity_group'].upper() |
|
|
|
|
|
return row.name, entities |
|
|
|
|
|
|
|
|
|
|
|
def process_row_BioPortal_api(args, key_bioportal, row): |
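    """Annotate one row (or a raw string) with the BioPortal AnnotatorPlus service.
    If args.KG_restriction is set, only the listed ontologies (mapped to their BioPortal acronyms) are queried;
    otherwise a default list of ontologies is used. Returns a flattened dataframe of annotations, or an empty
    dataframe on error or empty response."""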
|
|
|
|
|
|
|
|
if isinstance(row, list) or isinstance(row, pd.Series): |
|
|
context_to_annotate = row[args.source_column] |
|
|
elif isinstance(row, str): |
|
|
context_to_annotate = row |
|
|
else: |
|
|
        raise ValueError("Unsupported type for row. Expected list, pandas Series, or string.")
|
|
|
|
|
url="" |
|
|
if getattr(args, 'KG_restriction', None): |
|
|
|
|
|
|
|
|
if strtobool(args.debug): |
|
|
print("--- BIOPORTAL: " + context_to_annotate) |
|
|
|
|
|
|
|
|
if strtobool(args.debug): |
|
|
print("KG_restriction is provided and not empty:", args.KG_restriction) |
|
|
|
|
|
onto_clauses = "" |
|
|
for choice in args.KG_restriction: |
|
|
if choice == "SNOMED": |
|
|
choice="SNOMEDCT" |
|
|
elif choice == "RO": |
|
|
choice = "OBOREL" |
|
|
elif choice == "TRANS": |
|
|
choice = "PTRANS" |
|
|
elif choice == "FoodOn": |
|
|
choice = "FOODON" |
|
|
elif choice == "GeoSPARQL": |
|
|
choice = "GEOSPARQL" |
|
|
|
|
|
|
|
|
elif choice == "NCBITaxon_": |
|
|
choice = "NCBITAXON" |
|
|
if choice in ONLY_Ontologies_OnBIOPORTAL: |
|
|
onto_clauses=onto_clauses+choice+"," |
|
|
|
|
|
if onto_clauses and onto_clauses[-1] == ",": |
|
|
onto_clauses=onto_clauses[:-1] |
|
|
|
|
|
url = f"https://services.data.bioontology.org/annotatorplus/?text={context_to_annotate}&ontologies={onto_clauses}&longest_only=true&exclude_numbers=true&whole_word_only=true&exclude_synonyms=false&negation=false&experiencer=false&temporality=false&score_threshold=0&confidence_threshold=0&display_links=false&display_context=false&score=cvalue&apikey={key_bioportal}" |
|
|
|
|
|
else: |
|
|
|
|
|
kg_restriction = getattr(args, 'KG_restriction', None) |
|
|
if kg_restriction is not None and len(kg_restriction) == 0: |
|
|
print("KG_restriction is provided but empty") |
|
|
return pd.DataFrame() |
|
|
|
|
|
|
|
|
if strtobool(args.debug): |
|
|
print("--- BIOPORTAL: " + context_to_annotate) |
|
|
print("KG_restriction is not provided or empty - Consider all the KGs") |
|
|
|
|
|
url = f"https://services.data.bioontology.org/annotatorplus/?text={context_to_annotate}&ontologies=AEO,BFO,BIM,BCGO,CL,CHIRO,CHEBI,DCM,FMA,GO,GENO,GEOSPARQL,HL7,DOID,HP,HP_O,IDO,IAO,ICD10,LOINC,MESH,MONDO,NCIT,NCBITAXON,NIFCELL,NIFSTD,GML,OBCS,OCHV,OHPI,OPB,PTRANS,PLOSTHES,RADLEX,OBOREL,STY,SO,SNOMEDCT,STATO,SYMP,FOODON,UBERON,VO&longest_only=true&exclude_numbers=true&whole_word_only=true&exclude_synonyms=false&negation=false&experiencer=false&temporality=false&score_threshold=0&confidence_threshold=0&display_links=false&display_context=false&score=cvalue&apikey={key_bioportal}" |
|
|
|
|
|
|
|
|
response = requests.get(url) |
|
|
|
|
|
try: |
|
|
data = response.json() |
|
|
|
|
|
if not data: |
|
|
|
|
|
return pd.DataFrame() |
|
|
|
|
|
dff = pd.DataFrame(data) |
|
|
dff = dff.drop(columns=['hierarchy', 'mappings']) |
|
|
|
|
|
|
|
|
expanded_annotated_class = pd.json_normalize(dff['annotatedClass']) |
|
|
|
|
|
expanded_annotations = pd.DataFrame(dff['annotations'].tolist(), index=dff.index) |
|
|
expanded_annotations = pd.json_normalize(expanded_annotations[0]) |
|
|
|
|
|
|
|
|
df_expanded = dff.drop(columns=['annotatedClass', 'annotations']).join(expanded_annotated_class).join( |
|
|
expanded_annotations) |
|
|
|
|
|
|
|
|
df_expanded['@id'] = df_expanded['@id'].str.replace( |
|
|
"http://purl.bioontology.org/ontology/SNOMEDCT/", |
|
|
"http://snomed.info/id/" |
|
|
) |
|
|
|
|
|
return df_expanded |
|
|
|
|
|
|
|
|
except Exception as err: |
|
|
logging.error( |
|
|
f'ERROR ON BioPortal Annotator API Call\n\tError: {err}\n TextToAnnotate: {context_to_annotate}\n Have a check...') |
|
|
return pd.DataFrame() |
|
|
|
|
|
|
|
|
def parallel_process_df_Gliner(args, df, tokenizerGliner, modelGlinerBio, modelGliner, glinerlabels): |
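    """Run process_row_Gliner over the whole dataframe, in parallel with a ThreadPoolExecutor when
    args.num_cores_Gliner > 0, otherwise sequentially with df.apply, filling the 'annotation' column."""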
|
|
results = [] |
|
|
|
|
|
if args.num_cores_Gliner > 0: |
|
|
with ThreadPoolExecutor(max_workers=args.num_cores_Gliner) as executor: |
|
|
futures = [ |
|
|
executor.submit( |
|
|
process_row_Gliner, args, tokenizerGliner, modelGlinerBio, modelGliner, glinerlabels, row |
|
|
) |
|
|
for _, row in df.iterrows() |
|
|
] |
|
|
|
|
|
for future in tqdm(futures): |
|
|
drm_idx, entities = future.result() |
|
|
df.at[drm_idx, 'annotation'] = entities |
|
|
|
|
|
else: |
|
|
|
|
|
df['annotation'] = df.apply( |
|
|
lambda row: process_row_Gliner(args, tokenizerGliner, modelGlinerBio, modelGliner, glinerlabels, row)[1], |
|
|
axis=1 |
|
|
) |
|
|
|
|
|
return df |
|
|
|
|
|
|
|
|
def annotate(df, args, pipeInner, tokenizerGliner, modelGliner, modelGlinerBio, device="cpu"): |
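    """Annotate df[args.source_column] with the backend selected by args.model_id: a GLiNER model, the
    NCBO/BioPortal annotator, or a HuggingFace token-classification pipeline. Returns a dataframe with one row
    per extracted entity (exploded 'annotation' column)."""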
|
|
|
|
|
if strtobool(args.debug): |
|
|
print("\nAnnotate using " + args.model_id) |
|
|
print("device=" + str(device)) |
|
|
startAnnotate = time.time() |
|
|
|
|
|
if "gliner" in args.model_id: |
|
|
|
|
|
df['model'] = args.model_id |
|
|
df['annotation'] = None |
|
|
|
|
|
glinerlabels = ["location", "disease", "date", "numerical value", "number"] |
|
|
|
|
|
|
|
|
df = parallel_process_df_Gliner(args, df, tokenizerGliner, modelGlinerBio, modelGliner, glinerlabels) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
df_annot = df.explode('annotation').dropna(subset=['annotation']).reset_index(drop=True) |
|
|
|
|
|
elif "NCBO" in args.model_id: |
|
|
|
|
|
|
|
|
|
|
|
key_bioportal = "" |
|
|
if args.bioportalkey_filename and os.path.exists(args.bioportalkey_filename): |
|
|
fkeyname = args.bioportalkey_filename |
|
|
with open(fkeyname) as f: |
|
|
key_bioportal = f.read() |
|
|
else: |
|
|
key_bioportal = os.environ['key_bioportal'] |
|
|
|
|
|
df_annot = pd.DataFrame() |
|
|
for drm_idx, row in tqdm(df.iterrows()): |
|
|
df_BioPortalAnnotation=process_row_BioPortal_api(args, key_bioportal, row) |
|
|
|
|
|
if not df_BioPortalAnnotation.empty: |
|
|
|
|
|
df_BioPortalAnnotation = df_BioPortalAnnotation.sort_values( |
|
|
by=['from', 'text', 'score', 'matchType'], ascending=[True, True, False, False]) |
|
|
|
|
|
df_biop_minimised = df_BioPortalAnnotation.copy() |
|
|
|
|
|
|
|
|
grouped_biop = df_biop_minimised.groupby(['from', 'to']) |
|
|
|
|
|
idx_biop = grouped_biop['score'].idxmax() |
|
|
|
|
|
df_max_score_biop = df_biop_minimised.loc[idx_biop] |
|
|
|
|
|
df_max_score_biop = df_max_score_biop.reset_index(drop=True) |
|
|
|
|
|
|
|
|
pippo_lists = grouped_biop['@id'].apply(list).reset_index() |
|
|
|
|
|
|
|
|
df_max_score_biop = df_max_score_biop.merge(pippo_lists, on=['from', 'to']) |
|
|
|
|
|
|
|
|
df_max_score_biop = df_max_score_biop.rename(columns={'@id_x': '@id'}) |
|
|
df_max_score_biop = df_max_score_biop.rename(columns={'@id_y': 'ALLURIScontextFromNCBO'}) |
|
|
|
|
|
|
|
|
|
|
|
df_max_score_biop = df_max_score_biop[df_max_score_biop['score'] > 3.0] |
|
|
|
|
|
if "semantic_groups" not in df_max_score_biop.columns: |
|
|
|
|
|
df_max_score_biop["semantic_groups"] = None |
|
|
|
|
|
|
|
|
columns_to_keep = ["score", "from", "to", "prefLabel", "text", "semantic_groups", "@id", "ALLURIScontextFromNCBO"] |
|
|
|
|
|
|
|
|
df_max_score_biop = df_max_score_biop[columns_to_keep] |
|
|
|
|
|
|
|
|
df_max_score_biop = df_max_score_biop.rename(columns={"from": "start", "to": "end", "text": "word", "semantic_groups": "entity_group"}) |
|
|
|
|
|
|
|
|
df_max_score_biop = df_max_score_biop.reset_index(drop=True) |
|
|
|
|
|
df_max_score_biop['score'] = df_max_score_biop['score'].round(2) |
|
|
|
|
|
|
|
|
|
|
|
df_max_score_biop['entity_group'] = df_max_score_biop['entity_group'].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else (np.nan if x is None or (isinstance(x, float) and pd.isna(x)) else x)) |
|
|
|
|
|
|
|
|
|
|
|
all_empty_or_nan_or_empty_string = df_max_score_biop['entity_group'].replace('', pd.NA).isna().all() |
|
|
if not all_empty_or_nan_or_empty_string: |
|
|
|
|
|
|
|
|
|
|
|
min_score_biop = df_max_score_biop['score'].min() |
|
|
|
|
|
|
|
|
conditionBiop = (df_max_score_biop['entity_group'].isna()) & (df_max_score_biop['score'] == min_score_biop) |
|
|
df_max_score_biop = df_max_score_biop[~conditionBiop] |
|
|
|
|
|
|
|
|
df_max_score_biop['entity_group'] = df_max_score_biop['entity_group'].fillna('BIOP') |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if not df_max_score_biop.empty: |
|
|
row_df = pd.DataFrame([row] * len(df_max_score_biop), columns=row.index) |
|
|
row_df['model'] = args.model_id |
|
|
df_max_score_biop = pd.concat([row_df.reset_index(drop=True), df_max_score_biop.reset_index(drop=True)], |
|
|
axis=1) |
|
|
df_annot = pd.concat([df_annot, df_max_score_biop], ignore_index=True) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if not df_annot.empty: |
|
|
|
|
|
mmax_score = df_annot['score'].max() |
|
|
mmin_score = df_annot['score'].min() |
|
|
|
|
|
if mmax_score == mmin_score: |
|
|
df_annot['score'] = 0.3 |
|
|
df_annot.loc[df_annot['score'].notnull(), 'score'] = 0.7 |
|
|
else: |
|
|
|
|
|
|
|
|
df_annot = rescale_exponential_to_logarithmic(df_annot, 'score', new_min=0.7, new_max=1.0) |
|
|
|
|
|
columnsDict = ['start', 'end', 'word', 'entity_group', 'score', 'prefLabel'] |
|
|
|
|
|
df_annot['annotation'] = df_annot.apply(row_to_dict_string, axis=1, columnsDict=columnsDict) |
|
|
|
|
|
|
|
|
df_annot['annotation'] = df_annot['annotation'].apply( |
|
|
lambda x: json.loads(x) if isinstance(x, str) else x |
|
|
) |
|
|
|
|
|
df_annot = df_annot.drop(columns=columnsDict) |
|
|
|
|
|
else: |
|
|
|
|
|
HF_dataset = datasets.Dataset.from_pandas(pd.DataFrame(data=df)) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if strtobool(args.debug): |
|
|
print('Annotating texts...') |
|
|
|
|
|
annotated_texts = [ |
|
|
|
|
|
|
|
|
out for out in tqdm(pipeInner(KeyDataset(HF_dataset, args.source_column), batch_size=args.batch_size)) |
|
|
] |
|
|
|
|
|
if strtobool(args.debug): |
|
|
print('looping annotations...') |
|
|
|
|
|
|
|
|
df['model'] = args.model_id |
|
|
df['annotation'] = annotated_texts |
|
|
|
|
|
df_annot = df.explode('annotation').dropna(subset=['annotation']).reset_index(drop=True) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if strtobool(args.debug): |
|
|
endAnnotate = time.time() |
|
|
hours, rem = divmod(endAnnotate - startAnnotate, 3600) |
|
|
minutes, seconds = divmod(rem, 60) |
|
|
print("...end annotation - Time... {:0>2}:{:0>2}:{:05.2f}\n".format(int(hours), int(minutes), seconds)) |
|
|
print('\n') |
|
|
|
|
|
return df_annot |
|
|
|
|
|
|
|
|
def is_cross_inside(df_sorted, args, valuecutCross=0.75): |
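    """Flag overlapping (crossing or nested) mentions within the same sentence: in each group of overlapping spans,
    the preferred one (the one carrying a ToLink value, otherwise the highest-scoring one) stays unmarked, the
    others get IsCrossInside=1 and are dropped unless their score reaches valuecutCross."""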
|
|
|
|
|
df_sorted['IsCrossInside'] = 0 |
|
|
|
|
|
df_sorted = df_sorted.reset_index(drop=True) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
indexes_list = [] |
|
|
scores_list = [] |
|
|
IsToLinkContained = [] |
|
|
for i, row_outer in tqdm(df_sorted.iterrows()): |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if len(indexes_list)==0: |
|
|
scores_list.append(row_outer['score']) |
|
|
indexes_list.append(i) |
|
|
            if pd.notnull(row_outer['ToLink']) and len(row_outer['ToLink']) > 0:
|
|
IsToLinkContained.append(True) |
|
|
else: |
|
|
IsToLinkContained.append(False) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
else: |
|
|
if i in indexes_list: |
|
|
if (i == indexes_list[-1]): |
|
|
|
|
|
|
|
|
|
|
|
indexes_list = [indexes_list[-1]] |
|
|
scores_list = [scores_list[-1]] |
|
|
IsToLinkContained = [IsToLinkContained[-1]] |
|
|
else: |
|
|
continue |
|
|
|
|
|
|
|
|
for j in range(i + 1, len(df_sorted)): |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
row_inner = df_sorted.iloc[j] |
|
|
|
|
|
|
|
|
if row_inner['SentenceRef'] != row_outer['SentenceRef']: |
|
|
break |
|
|
elif row_inner['start'] >= row_outer['end']: |
|
|
break |
|
|
else: |
|
|
scores_list.append(row_inner['score']) |
|
|
indexes_list.append(j) |
|
|
                if pd.notnull(row_inner['ToLink']) and len(row_inner['ToLink']) > 0:
|
|
IsToLinkContained.append(True) |
|
|
else: |
|
|
IsToLinkContained.append(False) |
|
|
|
|
|
if len(indexes_list)>1: |
|
|
first_true_index = -1 |
|
|
try: |
|
|
first_true_index = IsToLinkContained.index(True) |
|
|
|
|
|
except ValueError: |
|
|
first_true_index = -1 |
|
|
|
|
|
|
|
|
topinlist=-1 |
|
|
if first_true_index >=0: |
|
|
topinlist = first_true_index |
|
|
else: |
|
|
topinlist = scores_list.index(max(scores_list)) |
|
|
|
|
|
|
|
|
if topinlist >= 0: |
|
|
for xx in range(0, len(indexes_list)): |
|
|
if xx == topinlist: |
|
|
continue |
|
|
df_sorted.at[indexes_list[xx], 'IsCrossInside'] = 1 |
|
|
|
|
|
else: |
|
|
indexes_list = [] |
|
|
scores_list = [] |
|
|
IsToLinkContained = [] |
|
|
|
|
|
|
|
|
if not df_sorted.empty: |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
condition = df_sorted.apply(lambda row: |
|
|
(row['IsCrossInside'] == 0) or |
|
|
( (row['IsCrossInside'] == 1) and (row['score'] >=valuecutCross )), |
|
|
axis=1) |
|
|
|
|
|
|
|
|
df_sorted = df_sorted[condition] |
|
|
|
|
|
return df_sorted |
|
|
|
|
|
def entitiesFusion(df_annotated, args): |
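    """Merge and filter the annotations produced by the different models: expand the per-entity JSON into columns,
    apply the score threshold, adjust start/end offsets for specific models, flag geographic (IsGeo) and biomedical
    (IsBio) entities, and keep the best-scoring candidate for each span after dropping duplicates."""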
|
|
|
|
|
if strtobool(args.debug): |
|
|
print("\nStart entities fusion and filtering ...") |
|
|
|
|
|
areJson = df_annotated["annotation"].apply(is_json) |
|
|
if False in areJson.unique(): |
|
|
for idxr, rr in df_annotated.iterrows(): |
|
|
|
|
|
if areJson[idxr] == False: |
|
|
print("PROBLEM WITH JSON AT INDEX " + str(idxr) + ":\n" + df_annotated["annotation"][idxr]) |
|
|
replacement_empty_myjson = '{\"entity_group\": \"\", \"score\": \"\", "word": \"\", \"start\": \"\", \"end\": \"\"}' |
|
|
df_annotated.at[idxr, "annotation"] = replacement_empty_myjson |
|
|
print(" ...... Then replacing it with empty JSON --> " + df_annotated["annotation"][idxr]) |
|
|
|
|
|
try: |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
df_extract = df_annotated.apply(lambda x: pd.Series(x['annotation'].values(), |
|
|
index=x['annotation'].keys()), axis=1) |
|
|
|
|
|
|
|
|
if '@id' in df_extract.columns: |
|
|
|
|
|
df_extract = df_extract.drop(columns='@id') |
|
|
|
|
|
df_annotated = pd.merge(df_annotated, df_extract, left_index=True, right_index=True) |
|
|
|
|
|
except Exception as err: |
|
|
logging.error( |
|
|
f'FAILED to extract json results\n\tError: {err}\nLeaving it as a single column then and not decompressing! Have a check...') |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
condition_to_delete = ( |
|
|
df_annotated[args.source_column].str.startswith('"') & |
|
|
df_annotated[args.source_column].str.endswith('"') & |
|
|
(df_annotated[args.source_column].apply(strip_quotes).str.lower() != df_annotated['word'].str.lower()) |
|
|
) |
|
|
|
|
|
|
|
|
df_annotated = df_annotated[~condition_to_delete].copy() |
|
|
|
|
|
|
|
|
|
|
|
if args.entities_filter_threshold > 0: |
|
|
|
|
|
df_annotated = df_annotated[df_annotated['score'] > args.entities_filter_threshold] |
|
|
if df_annotated.empty: |
|
|
return df_annotated |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
df_annotated.loc[ |
|
|
(~df_annotated['ToLink'].isnull()) & ( |
|
|
df_annotated['ToLink'].str.casefold() != df_annotated['word'].str.casefold()), 'ToLink'] = None |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if "IsGeo" not in df_annotated.columns: |
|
|
|
|
|
df_annotated.loc[:, "IsGeo"] = None |
|
|
if "IsBio" not in df_annotated.columns: |
|
|
|
|
|
df_annotated.loc[:, "IsBio"] = None |
|
|
|
|
|
df_annotated.loc[df_annotated['entity_group'] == 'LOCATION', 'entity_group'] = "LOC" |
|
|
df_annotated.loc[df_annotated['entity_group'] == 'LOC', 'IsGeo'] = 1 |
|
|
|
|
|
|
|
|
df_annotated.loc[df_annotated['entity_group'].str.lower().str.contains('disease'), 'IsBio'] = 1 |
|
|
df_annotated.loc[(df_annotated['model'].str.contains('Medical-NER')) & ( |
|
|
df_annotated['entity_group'].isin(['LOC', 'DATE', 'PER', 'ORG', 'DOSAGE', 'LAB_VALUE', 'DURATION']) == False), 'IsBio'] = 1 |
|
|
df_annotated.loc[(df_annotated['model'].str.contains('NCBO')) & ( |
|
|
df_annotated['entity_group'].isin(['CONC']) == False), 'IsBio'] = 1 |
|
|
|
|
|
|
|
|
df_annotated.loc[df_annotated['model'].str.lower().str.contains('ncbo'), 'start'] -= 1 |
|
|
|
|
|
|
|
|
df_annotated.loc[(df_annotated['model'] == 'blaze999/Medical-NER') & |
|
|
df_annotated.apply(lambda row: row[args.source_column][row['start']] == ' ', |
|
|
axis=1), 'start'] += 1 |
|
|
|
|
|
|
|
|
df_annotated.loc[df_annotated['model'].str.lower().str.contains('gliner') & |
|
|
df_annotated.apply(lambda row: row[args.source_column][row['start']] == ' ', |
|
|
axis=1), 'end'] += 1 |
|
|
df_annotated.loc[df_annotated['model'].str.lower().str.contains('gliner') & |
|
|
df_annotated.apply(lambda row: row[args.source_column][row['start']] == ' ', |
|
|
axis=1), 'start'] += 1 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
df_annotated['extracted_words'] = df_annotated[args.source_column].apply(extract_words,putInLower=True) |
|
|
|
|
|
df_annotated = df_annotated[df_annotated.apply(lambda row: all_words_in_list(row['word'], row['extracted_words'], putInLower=True), axis=1)] |
|
|
|
|
|
df_annotated = df_annotated.drop(columns=['extracted_words']) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
df_annotated = df_annotated.sort_values(by='ToLink', ascending=False, na_position='last') |
|
|
|
|
|
for col in df_annotated.columns: |
|
|
if df_annotated[col].apply(lambda x: isinstance(x, dict)).any(): |
|
|
if strtobool(args.debug): |
|
|
print( |
|
|
f"Column '{col}' contains dictionaries...converting it to strings otherwise it will not work the concat etc..") |
|
|
df_annotated[col] = df_annotated[col].apply(lambda x: str(x)) |
|
|
|
|
|
df_annotated = df_annotated.drop_duplicates(subset=[col for col in df_annotated.columns if |
|
|
col != 'ToLink' and col != 'ALLURIScontextFromNCBO' and not df_annotated[col].apply( |
|
|
lambda x: isinstance(x, dict)).any()], keep='first') |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
df_annotated_Geo = df_annotated.loc[df_annotated.groupby( |
|
|
['SentenceRef', args.source_column, 'end', 'start', df_annotated['word'].str.lower(), 'IsGeo'])[ |
|
|
'score'].idxmax()] |
|
|
df_annotated_Bio = df_annotated.loc[df_annotated.groupby( |
|
|
['SentenceRef', args.source_column, 'end', 'start', df_annotated['word'].str.lower(), 'IsBio'])[ |
|
|
'score'].idxmax()] |
|
|
df_annotated_all = df_annotated.loc[ |
|
|
df_annotated.groupby(['SentenceRef', args.source_column, 'end', 'start', df_annotated['word'].str.lower()])[ |
|
|
'score'].idxmax()] |
|
|
|
|
|
|
|
|
df_annotated_combined = pd.concat([df_annotated_Geo, df_annotated_Bio, df_annotated_all]) |
|
|
df_annotated_combined = df_annotated_combined.drop_duplicates(subset=[col for col in df_annotated_combined.columns if |
|
|
col != 'ToLink' and col != 'ALLURIScontextFromNCBO' and not df_annotated_combined[col].apply( |
|
|
lambda x: isinstance(x, dict)).any()], keep='first') |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
df_annotated_combined.loc[:, "IsBioGeo"] = df_annotated_combined.loc[:, 'IsGeo'].infer_objects(copy=False).fillna(0) + df_annotated_combined.loc[:, 'IsBio'].infer_objects(copy=False).fillna(0) |
|
|
|
|
|
df_annotated_combined = df_annotated_combined.loc[df_annotated_combined.groupby( |
|
|
['SentenceRef', args.source_column, 'end', 'start', df_annotated['word'].str.lower()])['IsBioGeo'].idxmax()] |
|
|
|
|
|
df_annotated_combined = df_annotated_combined.loc[ |
|
|
df_annotated_combined.groupby( |
|
|
['SentenceRef', args.source_column, 'end', 'start', df_annotated['word'].str.lower(), 'IsBioGeo'])[ |
|
|
'score'].idxmax()] |
|
|
|
|
|
df_annotated_combined = df_annotated_combined.drop('IsBioGeo', axis=1) |
|
|
df_annotated_combined.loc[df_annotated_combined['IsBio'] == 0, 'IsBio'] = None |
|
|
df_annotated_combined.loc[df_annotated_combined['IsGeo'] == 0, 'IsGeo'] = None |
|
|
|
|
|
df_annotated_combined = df_annotated_combined.sort_values(by=['SentenceRef', 'start', 'ToLink', 'word', 'score'], ascending=[True, True, True, True, False]) |
|
|
|
|
|
|
|
|
return df_annotated_combined |
|
|
|
|
|
|
|
|
def geonames_api_call(word,args, key_geonames,cache_map_geonames): |
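    """Resolve a location name through the GeoNames search API (name_equals, maxRows=1) and return its
    sws.geonames.org URL; hits and misses are stored in cache_map_geonames. The extra None values in the return
    tuple keep the signature aligned with the other linking helpers."""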
|
|
|
|
|
context = "" |
|
|
singleContext = None |
|
|
globalContext = None |
|
|
singleTriples = None |
|
|
globalTriples = None |
|
|
|
|
|
if cache_map_geonames is not None: |
|
|
if word in cache_map_geonames: |
|
|
if context in cache_map_geonames[word]: |
|
|
url_text = cache_map_geonames[word][context] |
|
|
if strtobool(args.debug): |
|
|
print("RETRIEVED CACHED RESULT FOR:\n", word, " => ", url_text, "\n") |
|
|
return url_text, singleContext, globalContext, singleTriples, globalTriples, cache_map_geonames |
|
|
|
|
|
|
|
|
url = f"http://api.geonames.org/search?name_equals={word}&maxRows=1&type=json&username={key_geonames}" |
|
|
response = requests.get(url) |
|
|
|
|
|
try: |
|
|
data = response.json() |
|
|
if data['geonames']: |
|
|
|
|
|
geonameId = data['geonames'][0]['geonameId'] |
|
|
geonameUrl = "https://sws.geonames.org/" + str(geonameId) + "/" |
|
|
|
|
|
if cache_map_geonames is not None: |
|
|
if not word in cache_map_geonames: |
|
|
cache_map_geonames[word] = {} |
|
|
cache_map_geonames[word][context] = geonameUrl |
|
|
|
|
|
return geonameUrl, singleContext, globalContext, singleTriples, globalTriples, cache_map_geonames |
|
|
else: |
|
|
|
|
|
if cache_map_geonames is not None: |
|
|
if not word in cache_map_geonames: |
|
|
cache_map_geonames[word] = {} |
|
|
cache_map_geonames[word][context] = None |
|
|
|
|
|
return None, singleContext, globalContext, singleTriples, globalTriples, cache_map_geonames |
|
|
|
|
|
except Exception as err: |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return None, singleContext, globalContext, singleTriples, globalTriples, cache_map_geonames |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def getLinearTextualContextFromTriples(word,labelTriplesLIST, text_splitter, args, map_query_input_output, cleanInput=True, questionText=""): |
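    """Build a short textual context for `word` from a list of (subject, predicate, object) label triples.
    With args.UseRetrieverForContextCreation the triples are reranked with RAG_retrieval_Base and the top passages
    are concatenated; otherwise the triples are rephrased by the LLM (call_model) with a reformulation prompt.
    map_query_input_output acts as a cache of previous LLM answers."""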
|
|
|
|
|
|
|
|
|
|
|
|
|
|
word = word.lower() |
|
|
word = word.capitalize() |
|
|
|
|
|
labelTriples="" |
|
|
|
|
|
if labelTriplesLIST and getattr(args, 'maxTriplesContextComputation', None): |
|
|
if args.maxTriplesContextComputation > 0: |
|
|
if len(labelTriplesLIST) > args.maxTriplesContextComputation: |
|
|
labelTriplesLIST = labelTriplesLIST[:args.maxTriplesContextComputation] |
|
|
|
|
|
if (strtobool(args.UseRetrieverForContextCreation) == True): |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if strtobool(args.debug): |
|
|
print("Start reranking2 - num passages : ", len(labelTriplesLIST), "\n") |
|
|
startRerank2 = time.time() |
|
|
|
|
|
labelTriples = "" |
|
|
|
|
|
try: |
|
|
|
|
|
passages = [] |
|
|
for i, triple in enumerate(labelTriplesLIST, start=1): |
|
|
|
|
|
TriplesString = (" ".join(str(element).capitalize() for element in triple)) |
|
|
passages.append(TriplesString) |
|
|
|
|
|
nback = 1 |
|
|
if len(passages) <= 10: |
|
|
nback = len(passages) |
|
|
elif len(passages) <= 1000: |
|
|
nback = 10+int(0.1 * len(passages)) |
|
|
elif len(passages) <= 5000: |
|
|
nback = 200 |
|
|
elif len(passages) <= 10000: |
|
|
nback = 300 |
|
|
else: |
|
|
nback = 400 |
|
|
|
|
|
df_retrieved = RAG_retrieval_Base(questionText, passages, min_threshold=0, max_num_passages=nback) |
|
|
|
|
|
if not df_retrieved.empty: |
|
|
|
|
|
countRetr = 0 |
|
|
min_threshold = 0.80 |
|
|
countRetr = (df_retrieved['score'] > min_threshold).sum() |
|
|
|
|
|
                if nback > 10:
                    countRetrThreshold = 10
                else:
                    countRetrThreshold = int(nback / 2)
|
|
if countRetrThreshold <=0: |
|
|
countRetrThreshold = 1 |
|
|
|
|
|
while countRetr <= countRetrThreshold: |
|
|
min_threshold = min_threshold - 0.05 |
|
|
countRetr = (df_retrieved['score'] >= min_threshold).sum() |
|
|
if min_threshold < 0.2: |
|
|
break |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if countRetr > 0: |
|
|
df_retrieved = df_retrieved[df_retrieved['score'] > min_threshold] |
|
|
|
|
|
|
|
|
labelTriplesLIST_RAGGED = df_retrieved['Passage'].apply(lambda x: (x,)).tolist() |
|
|
labelTriplesAPP = ". ".join( |
|
|
" ".join(str(element).capitalize() for element in triple) for triple in labelTriplesLIST_RAGGED) |
|
|
|
|
|
if not labelTriples: |
|
|
labelTriples = labelTriplesAPP |
|
|
else: |
|
|
labelTriples = labelTriples + ". " + labelTriplesAPP |
|
|
|
|
|
else: |
|
|
labelTriplesLIST_RAGGED = [] |
|
|
labelTriples = "" |
|
|
|
|
|
|
|
|
if strtobool(args.debug): |
|
|
numfinal = 0 |
|
|
if labelTriplesLIST_RAGGED: |
|
|
numfinal = len(labelTriplesLIST_RAGGED) |
|
|
print("End reranking2 - found final passages : ", numfinal, "\n") |
|
|
endRerank2 = time.time() |
|
|
hours, rem = divmod(endRerank2 - startRerank2, 3600) |
|
|
minutes, seconds = divmod(rem, 60) |
|
|
print("Rerank2 Time... {:0>2}:{:0>2}:{:05.2f}\n".format(int(hours), int(minutes), seconds)) |
|
|
|
|
|
|
|
|
except Exception as err: |
|
|
print("SOMETHING HAPPENED on PASSAGE RERANKING for Question :"+questionText+"\n") |
|
|
print(err) |
|
|
|
|
|
|
|
|
|
|
|
else: |
|
|
labelTriples = ". ".join(" ".join(str(element).capitalize() for element in triple) for triple in labelTriplesLIST) |
|
|
|
|
|
|
|
|
    if not labelTriples or labelTriples.strip() == "":
|
|
logging.warning("getLinearTextualContextFromTriples - No text or prompt supplied! No relevant contextual triples retrieved...Skypping it! Word: "+str(word)) |
|
|
return "", map_query_input_output |
|
|
|
|
|
if token_counter(labelTriples, args.model_name) > args.tokens_max: |
|
|
texts = text_splitter.create_documents([labelTriples]) |
|
|
labelTriples = texts[0].page_content |
|
|
        if not labelTriples or labelTriples.strip() == "":
|
|
logging.warning("after splitting ...No text or prompt supplied! Skypping it! Word: "+str(word)) |
|
|
return "", map_query_input_output |
|
|
|
|
|
|
|
|
contextText = "" |
|
|
|
|
|
if (strtobool(args.UseRetrieverForContextCreation) == True): |
|
|
|
|
|
contextText = labelTriples |
|
|
|
|
|
else: |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
myPromt = f""" |
|
|
Can you reformulate the following notes, provided between triple backticks, into clear and complete sentences about "{word}"? |
|
|
Ensure the rewriting is human-readable and easily interpretable. Maintain conciseness and exhaustiveness, including all information from the notes. |
|
|
Avoid using note formats or lists, and refrain from inventing additional information. |
|
|
""" |
|
|
myDelimiter = "```" |
|
|
|
|
|
if cleanInput==True: |
|
|
labelTriples = cleanInputText(labelTriples) |
|
|
|
|
|
|
|
|
|
|
|
if map_query_input_output is not None: |
|
|
key = args.model_name + "__" + str(args.temperature) + "__" + myPromt |
|
|
|
|
|
if key in map_query_input_output: |
|
|
if labelTriples in map_query_input_output[key]: |
|
|
output = map_query_input_output[key][labelTriples] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if strtobool(args.debug): |
|
|
print("RETRIEVED CACHED RESULT FOR:\n", myPromt, "\n", myDelimiter, word, myDelimiter, "\n=>\n", output, "\n") |
|
|
|
|
|
return output, map_query_input_output |
|
|
|
|
|
|
|
|
|
|
|
try: |
|
|
|
|
|
contextText = "" |
|
|
if args.service_provider == "gptjrc": |
|
|
contextText = call_model(input_text=labelTriples, prompt=myPromt, model=args.model_name, |
|
|
temperature=args.temperature, delimiter=myDelimiter, |
|
|
InContextExamples=[], |
|
|
handler=api_call_gptjrc, |
|
|
verbose=True, args=args) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if contextText: |
|
|
if not isinstance(contextText, str): |
|
|
contextText = contextText['choices'][0]['message']['content'] |
|
|
|
|
|
if map_query_input_output is not None: |
|
|
if not key in map_query_input_output: |
|
|
map_query_input_output[key] = {} |
|
|
|
|
|
if contextText: |
|
|
if contextText != "": |
|
|
map_query_input_output[key][labelTriples] = contextText |
|
|
|
|
|
|
|
|
except Exception as err: |
|
|
return None, map_query_input_output |
|
|
|
|
|
|
|
|
|
|
|
return contextText, map_query_input_output |
|
|
|
|
|
|
|
|
|
|
|
def getUrlBioAndAllOtherBioConcepts(word, args, key_virtuoso, cache_map_virtuoso, endpoint, VirtuosoUsername, contextWordVirtuoso, UseBioportalForLinking=False, questionText="" ): |
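    """Link `word` to a biomedical concept IRI, either via a SPARQL query against the Virtuoso endpoint or via the
    BioPortal annotator when UseBioportalForLinking is True. Returns (best IRI, list of all candidate IRIs,
    updated cache_map_virtuoso)."""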
|
|
|
|
|
|
|
|
if strtobool(args.debug): |
|
|
print("--- start getUrlBioAndAllOtherBioConcepts for " + word.lower()) |
|
|
|
|
|
entityBioeUrl = None |
|
|
ALLURIScontext = [] |
|
|
|
|
|
key_bioportal = "" |
|
|
if args.bioportalkey_filename and os.path.exists(args.bioportalkey_filename): |
|
|
fkeyname = args.bioportalkey_filename |
|
|
with open(fkeyname) as f: |
|
|
key_bioportal = f.read() |
|
|
else: |
|
|
key_bioportal = os.environ['key_bioportal'] |
|
|
|
|
|
|
|
|
if getattr(args, 'KG_restriction', None): |
|
|
|
|
|
|
|
|
if strtobool(args.debug): |
|
|
print("--- " + word.lower()) |
|
|
|
|
|
|
|
|
if strtobool(args.debug): |
|
|
print("KG_restriction is provided and not empty:", args.KG_restriction) |
|
|
|
|
|
from_clauses = ' '.join([f"FROM <{choice}>" for choice in args.KG_restriction]) |
|
|
|
|
|
|
|
|
query = f""" |
|
|
prefix skosxl: <http://www.w3.org/2008/05/skos-xl#> |
|
|
SELECT ?concept ?label (COUNT(?edge) AS ?score) |
|
|
{from_clauses} |
|
|
WHERE {{ |
|
|
?concept skos:prefLabel|rdfs:label|skos:altLabel|skosxl:literalForm|obo:hasRelatedSynonym ?label . |
|
|
FILTER (LCASE(STR(?label)) = "{word.lower()}") |
|
|
?concept ?edge ?o . |
|
|
}} |
|
|
GROUP BY ?concept ?label |
|
|
ORDER BY DESC(?score) |
|
|
""" |
|
|
|
|
|
|
|
|
onto_clauses = "" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for choice in args.KG_restriction: |
|
|
if choice == "SNOMED": |
|
|
choice="SNOMEDCT" |
|
|
elif choice == "RO": |
|
|
choice = "OBOREL" |
|
|
elif choice == "TRANS": |
|
|
choice = "PTRANS" |
|
|
elif choice == "FoodOn": |
|
|
choice = "FOODON" |
|
|
elif choice == "GeoSPARQL": |
|
|
choice = "GEOSPARQL" |
|
|
|
|
|
|
|
|
elif choice == "NCBITaxon_": |
|
|
choice = "NCBITAXON" |
|
|
if choice in ONLY_Ontologies_OnBIOPORTAL: |
|
|
onto_clauses=onto_clauses+choice+"," |
|
|
|
|
|
if onto_clauses and onto_clauses[-1] == ",": |
|
|
onto_clauses = onto_clauses[:-1] |
|
|
|
|
|
url = f"https://services.data.bioontology.org/annotatorplus/?text={word.lower()}&ontologies={onto_clauses}&longest_only=true&exclude_numbers=true&whole_word_only=true&exclude_synonyms=false&negation=false&experiencer=false&temporality=false&score_threshold=0&confidence_threshold=0&display_links=false&display_context=false&score=cvalue&apikey={key_bioportal}" |
|
|
|
|
|
else: |
|
|
|
|
|
|
|
|
kg_restriction = getattr(args, 'KG_restriction', None) |
|
|
if kg_restriction is not None and len(kg_restriction) == 0: |
|
|
print("KG_restriction is provided but empty") |
|
|
return None, None, cache_map_virtuoso |
|
|
|
|
|
if strtobool(args.debug): |
|
|
print("--- " + word.lower()) |
|
|
print("KG_restriction is not provided or empty - Consider all the KGs in the virtuoso endpoint") |
|
|
|
|
|
query = f""" |
|
|
prefix skosxl: <http://www.w3.org/2008/05/skos-xl#> |
|
|
SELECT ?concept ?label (COUNT(?edge) AS ?score) |
|
|
WHERE {{ |
|
|
?concept skos:prefLabel|rdfs:label|skos:altLabel|skosxl:literalForm|obo:hasRelatedSynonym ?label . |
|
|
FILTER (LCASE(STR(?label)) = "{word.lower()}") |
|
|
?concept ?edge ?o . |
|
|
}} |
|
|
GROUP BY ?concept ?label |
|
|
ORDER BY DESC(?score) |
|
|
""" |
|
|
|
|
|
|
|
|
url = f"https://services.data.bioontology.org/annotatorplus/?text={word.lower()}&ontologies=AEO,BFO,BIM,BCGO,CL,CHIRO,CHEBI,DCM,FMA,GO,GENO,GEOSPARQL,HL7,DOID,HP,HP_O,IDO,IAO,ICD10,LOINC,MESH,MONDO,NCIT,NCBITAXON,NIFCELL,NIFSTD,GML,OBCS,OCHV,OHPI,OPB,PTRANS,PLOSTHES,RADLEX,OBOREL,STY,SO,SNOMEDCT,STATO,SYMP,FOODON,UBERON,VO&longest_only=true&exclude_numbers=true&whole_word_only=true&exclude_synonyms=false&negation=false&experiencer=false&temporality=false&score_threshold=0&confidence_threshold=0&display_links=false&display_context=false&score=cvalue&apikey={key_bioportal}" |
|
|
|
|
|
try: |
|
|
|
|
|
if UseBioportalForLinking == False: |
|
|
|
|
|
if strtobool(args.debug): |
|
|
print("Use Virtuoso Sparql endpoint for linking ... " + word.lower()) |
|
|
|
|
|
|
|
|
responseText = sparqlQuery(endpoint, query, VirtuosoUsername, key_virtuoso, |
|
|
strtobool(args.USE_CACHE)) |
|
|
|
|
|
|
|
|
results = json.loads(responseText) |
|
|
|
|
|
if len(results) > 0 and results['results']['bindings']: |
|
|
|
|
|
entityBioeUrl = str(results['results']['bindings'][0]['concept']['value']) |
|
|
|
|
|
if cache_map_virtuoso is not None: |
|
|
if not word in cache_map_virtuoso: |
|
|
cache_map_virtuoso[word] = {} |
|
|
cache_map_virtuoso[word][contextWordVirtuoso] = entityBioeUrl |
|
|
|
|
|
|
|
|
for result in results['results']['bindings']: |
|
|
|
|
|
|
|
|
contextConcept = result['concept']['value'] |
|
|
if contextConcept not in ALLURIScontext: |
|
|
ALLURIScontext.append(contextConcept) |
|
|
if cache_map_virtuoso is not None: |
|
|
if not word in cache_map_virtuoso: |
|
|
cache_map_virtuoso[word] = {} |
|
|
cache_map_virtuoso[word][contextConcept] = None |
|
|
|
|
|
if ALLURIScontext and isinstance(ALLURIScontext, list): |
|
|
ALLURIScontext = list(set(ALLURIScontext)) |
|
|
|
|
|
if cache_map_virtuoso is not None: |
|
|
if not word in cache_map_virtuoso: |
|
|
cache_map_virtuoso[word] = {} |
|
|
cache_map_virtuoso[word]['ALLURIScontext'] = ALLURIScontext |
|
|
|
|
|
else: |
|
|
|
|
|
if cache_map_virtuoso is not None: |
|
|
if not word in cache_map_virtuoso: |
|
|
cache_map_virtuoso[word] = {} |
|
|
cache_map_virtuoso[word][contextWordVirtuoso] = None |
|
|
cache_map_virtuoso[word]['ALLURIScontext'] = [] |
|
|
|
|
|
else: |
|
|
|
|
|
if strtobool(args.debug): |
|
|
print("Use Bioportal for linking ... " + word.lower()) |
|
|
|
|
|
response = requests.get(url) |
|
|
|
|
|
try: |
|
|
data = response.json() |
|
|
|
|
|
if not data: |
|
|
|
|
|
|
|
|
return None, None, cache_map_virtuoso |
|
|
|
|
|
dff = pd.DataFrame(data) |
|
|
dff = dff.drop(columns=['hierarchy', 'mappings']) |
|
|
|
|
|
|
|
|
expanded_annotated_class = pd.json_normalize(dff['annotatedClass']) |
|
|
|
|
|
expanded_annotations = pd.DataFrame(dff['annotations'].tolist(), index=dff.index) |
|
|
expanded_annotations = pd.json_normalize(expanded_annotations[0]) |
|
|
|
|
|
|
|
|
df_expanded = dff.drop(columns=['annotatedClass', 'annotations']).join(expanded_annotated_class).join( |
|
|
expanded_annotations) |
|
|
|
|
|
|
|
|
df_expanded['@id'] = df_expanded['@id'].str.replace( |
|
|
"http://purl.bioontology.org/ontology/SNOMEDCT/", |
|
|
"http://snomed.info/id/" |
|
|
) |
|
|
|
|
|
if not df_expanded.empty: |
|
|
|
|
|
df_expanded = df_expanded.sort_values( |
|
|
by=['from', 'text', 'score', 'matchType'], ascending=[True, True, False, False]) |
|
|
|
|
|
df_expanded = df_expanded.drop_duplicates(subset=['@id']) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if 'prefLabel' in df_expanded.columns: |
|
|
if 'synonym' in df_expanded.columns: |
|
|
df_expanded = df_expanded[ |
|
|
df_expanded['prefLabel'].apply( |
|
|
lambda x: isinstance(x, str) and x.lower() == word.lower() |
|
|
) | |
|
|
df_expanded['synonym'].apply( |
|
|
lambda x: isinstance(x, list) and any(item.lower() == word.lower() for item in x) |
|
|
) |
|
|
] |
|
|
else: |
|
|
df_expanded = df_expanded[ |
|
|
df_expanded['prefLabel'].apply( |
|
|
lambda x: isinstance(x, str) and x.lower() == word.lower() |
|
|
) |
|
|
] |
|
|
|
|
|
if df_expanded.empty: |
|
|
|
|
|
|
|
|
return None, None, cache_map_virtuoso |
|
|
|
|
|
|
|
|
columns_to_keep = ["score", "from", "to", "prefLabel", "text", "@id"] |
|
|
|
|
|
|
|
|
df_expanded = df_expanded[columns_to_keep] |
|
|
|
|
|
|
|
|
df_expanded = df_expanded.rename( |
|
|
columns={"from": "start", "to": "end", "text": "word"}) |
|
|
|
|
|
|
|
|
df_expanded = df_expanded.reset_index(drop=True) |
|
|
|
|
|
df_expanded['score'] = df_expanded['score'].round(2) |
|
|
|
|
|
|
|
|
max_score_index = df_expanded['score'].idxmax() |
|
|
|
|
|
max_score_row = df_expanded.loc[df_expanded['score'].idxmax()] |
|
|
|
|
|
entityBioeUrl = str(max_score_row['@id']) |
|
|
|
|
|
if cache_map_virtuoso is not None: |
|
|
if not word in cache_map_virtuoso: |
|
|
cache_map_virtuoso[word] = {} |
|
|
cache_map_virtuoso[word][contextWordVirtuoso] = entityBioeUrl |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
df_expanded.reset_index(drop=True, inplace=True) |
|
|
|
|
|
|
|
|
for index, row in df_expanded.iterrows(): |
|
|
|
|
|
if row['@id'] is not None and pd.notna(row['@id']): |
|
|
contextConcept=row['@id'] |
|
|
ALLURIScontext.append(contextConcept) |
|
|
if cache_map_virtuoso is not None: |
|
|
if not word in cache_map_virtuoso: |
|
|
cache_map_virtuoso[word] = {} |
|
|
cache_map_virtuoso[word][contextConcept] = None |
|
|
|
|
|
if ALLURIScontext and isinstance(ALLURIScontext, list): |
|
|
ALLURIScontext = list(set(ALLURIScontext)) |
|
|
|
|
|
if cache_map_virtuoso is not None: |
|
|
if not word in cache_map_virtuoso: |
|
|
cache_map_virtuoso[word] = {} |
|
|
cache_map_virtuoso[word]['ALLURIScontext'] = ALLURIScontext |
|
|
|
|
|
|
|
|
return entityBioeUrl, ALLURIScontext, cache_map_virtuoso |
|
|
|
|
|
else: |
|
|
|
|
|
|
|
|
return None, None, cache_map_virtuoso |
|
|
|
|
|
|
|
|
except Exception as err: |
|
|
logging.error( |
|
|
f'ERROR ON BioPortal Annotator API Call\n\tError: {err}\n TextToAnnotate: {word.lower()}\n Have a check...') |
|
|
|
|
|
return None, None, cache_map_virtuoso |
|
|
|
|
|
except Exception as err: |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return None, None, cache_map_virtuoso |
|
|
|
|
|
|
|
|
return entityBioeUrl, ALLURIScontext, cache_map_virtuoso |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output, id=None, iALLURIScontextFromNCBO=None,UseBioportalForLinking=True,questionText=""): |
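    """Full entity-linking step for a single word: resolve its concept IRI(s) from the cache, the NCBO hints, or
    getUrlBioAndAllOtherBioConcepts, optionally collect label triples for the entity and for all candidate IRIs,
    and build the single/global textual contexts with getLinearTextualContextFromTriples. Returns the IRI, the
    candidate IRIs, both contexts, the serialized triples, and the updated caches."""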
|
|
|
|
|
if strtobool(args.debug): |
|
|
print(f"\n----- Starting virtuoso_api_call for {word}") |
|
|
|
|
|
word = word.lower() |
|
|
word = strip_quotes(word) |
|
|
|
|
|
endpoint = 'https://api-vast.jrc.service.ec.europa.eu/sparql' |
|
|
VirtuosoUsername = 'dba' |
|
|
|
|
|
if getattr(args, 'KG_restriction', None): |
|
|
contextWordVirtuoso = ', '.join(sorted(args.KG_restriction)) |
|
|
else: |
|
|
contextWordVirtuoso = "" |
|
|
|
|
|
singleContext = None |
|
|
globalContext = None |
|
|
sssingleTriples = None |
|
|
ggglobalTriples = None |
|
|
unique_listLabelTriples = [] |
|
|
unique_listGlobalTriples = [] |
|
|
|
|
|
ALLURIScontext = [] |
|
|
|
|
|
url_text = None |
|
|
if id: |
|
|
url_text = id |
|
|
|
|
|
if iALLURIScontextFromNCBO and isinstance(iALLURIScontextFromNCBO, list): |
|
|
ALLURIScontext=iALLURIScontextFromNCBO |
|
|
ALLURIScontext = list(set(ALLURIScontext)) |
|
|
|
|
|
if (cache_map_virtuoso is not None) and (not url_text): |
|
|
if word in cache_map_virtuoso: |
|
|
if contextWordVirtuoso in cache_map_virtuoso[word]: |
|
|
url_text = cache_map_virtuoso[word][contextWordVirtuoso] |
|
|
if strtobool(args.debug): |
|
|
print("RETRIEVED CACHED RESULT FOR:\n", word, " => ", url_text, "\n") |
|
|
if not url_text: |
|
|
return None, None, None, None, None, None, cache_map_virtuoso, load_map_query_input_output |
|
|
|
|
|
if url_text and not ALLURIScontext: |
|
|
if cache_map_virtuoso is not None: |
|
|
if word in cache_map_virtuoso: |
|
|
if 'ALLURIScontext' in cache_map_virtuoso[word]: |
|
|
ALLURIScontext = cache_map_virtuoso[word]['ALLURIScontext'] |
|
|
|
|
|
entityBioeUrl = None |
|
|
if url_text and ALLURIScontext: |
|
|
entityBioeUrl = url_text |
|
|
|
|
|
else: |
|
|
|
|
|
try: |
|
|
entityBioeUrl, ALLURIScontext, cache_map_virtuoso = getUrlBioAndAllOtherBioConcepts(word, args, key_virtuoso, cache_map_virtuoso, endpoint, VirtuosoUsername, contextWordVirtuoso, UseBioportalForLinking=UseBioportalForLinking, questionText=questionText ) |
|
|
if ALLURIScontext and isinstance(ALLURIScontext, list): |
|
|
ALLURIScontext = list(set(ALLURIScontext)) |
|
|
except Exception as err: |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return None, None, None, None, None, None, cache_map_virtuoso, load_map_query_input_output |
|
|
|
|
|
|
|
|
|
|
|
if entityBioeUrl: |
|
|
|
|
|
if strtobool(args.computeEntityContext) and (strtobool(args.computeEntityGlobalContext)==False): |
|
|
|
|
|
if strtobool(args.debug): |
|
|
print("START computeEntityContext") |
|
|
|
|
|
unique_listLabelTriples = [] |
|
|
singleContext = None |
|
|
|
|
|
if cache_map_virtuoso is not None: |
|
|
if entityBioeUrl in cache_map_virtuoso: |
|
|
if "LabelTriples" in cache_map_virtuoso[entityBioeUrl]: |
|
|
unique_listLabelTriples = cache_map_virtuoso[entityBioeUrl]["LabelTriples"] |
|
|
if strtobool(args.debug): |
|
|
print("RETRIEVED CACHED RESULT FOR:\n", entityBioeUrl, " => ", "LabelTriples", "\n") |
|
|
if ("SingleContext" in cache_map_virtuoso[entityBioeUrl]) and (strtobool(args.UseRetrieverForContextCreation)==False): |
|
|
singleContext = cache_map_virtuoso[entityBioeUrl]["SingleContext"] |
|
|
if strtobool(args.debug): |
|
|
print("RETRIEVED CACHED RESULT FOR:\n", entityBioeUrl, " => ", "SingleContext", "\n") |
|
|
|
|
|
|
|
|
if not singleContext: |
|
|
if unique_listLabelTriples: |
|
|
singleContext, load_map_query_input_output = getLinearTextualContextFromTriples(word, unique_listLabelTriples, |
|
|
text_splitter, args, |
|
|
load_map_query_input_output,cleanInput=True,questionText=questionText) |
|
|
else: |
|
|
|
|
|
query = f""" |
|
|
prefix skosxl: <http://www.w3.org/2008/05/skos-xl#> |
|
|
SELECT DISTINCT ?labelS ?labelP ?labelO |
|
|
WHERE {{ |
|
|
{{ |
|
|
<{entityBioeUrl}> ?p ?o. |
|
|
<{entityBioeUrl}> skos:prefLabel|rdfs:label|skos:altLabel|skosxl:literalForm|obo:hasRelatedSynonym ?labelS . |
|
|
?p skos:prefLabel|rdfs:label|skos:altLabel|skosxl:literalForm|obo:hasRelatedSynonym ?labelP . |
|
|
?o skos:prefLabel|rdfs:label|skos:altLabel|skosxl:literalForm|obo:hasRelatedSynonym ?labelO . |
|
|
}} |
|
|
UNION |
|
|
{{ |
|
|
SELECT ?labelS ?labelP ?labelO |
|
|
WHERE {{ |
|
|
<{entityBioeUrl}> ?p ?labelO . |
|
|
<{entityBioeUrl}> skos:prefLabel|rdfs:label|skos:altLabel|skosxl:literalForm|obo:hasRelatedSynonym ?labelS . |
|
|
?p skos:prefLabel|rdfs:label|skos:altLabel|skosxl:literalForm|obo:hasRelatedSynonym ?labelP . |
|
|
FILTER (isLiteral(?labelO)) |
|
|
}} |
|
|
}} |
|
|
UNION |
|
|
{{ |
|
|
SELECT DISTINCT ?labelS ?labelP ?labelO |
|
|
WHERE {{ |
|
|
<{entityBioeUrl}> ?ppp ?ooo . |
|
|
?ooo rdf:type owl:Restriction . |
|
|
?ooo owl:onProperty ?p . |
|
|
?ooo owl:someValuesFrom ?o . |
|
|
<{entityBioeUrl}> skos:prefLabel|rdfs:label|skos:altLabel|skosxl:literalForm|obo:hasRelatedSynonym ?labelS . |
|
|
?p skos:prefLabel|rdfs:label|skos:altLabel|skosxl:literalForm|obo:hasRelatedSynonym ?labelP . |
|
|
?o skos:prefLabel|rdfs:label|skos:altLabel|skosxl:literalForm|obo:hasRelatedSynonym ?labelO . |
|
|
}} |
|
|
}} |
|
|
}} |
|
|
""" |
|
|
|
|
|
try: |
|
|
responseText = sparqlQuery(endpoint, query, VirtuosoUsername, key_virtuoso, strtobool(args.USE_CACHE)) |
|
|
|
|
|
|
|
|
results = json.loads(responseText) |
|
|
|
|
|
if len(results) > 0 and results['results']['bindings']: |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
labelTriples="" |
|
|
listLabelTriples = [] |
|
|
pattern = r'\^\^<http:.*?>' |
|
|
for result in results['results']['bindings']: |
|
|
|
|
|
|
|
|
ss = str(result['labelS']['value']).strip().replace("..",".").replace("@en","") |
|
|
ss = re.sub(pattern, '', ss) |
|
|
pp = split_camel_case(str(result['labelP']['value'])).replace("_"," ").strip().replace("..",".").replace("@en","") |
|
|
pp = re.sub(pattern, '', pp) |
|
|
oo = str(result['labelO']['value']).replace("_"," ").strip().replace("..",".").replace("@en","") |
|
|
oo = re.sub(pattern, '', oo) |
|
|
listLabelTriples.append([ss, pp, oo]) |
|
|
|
|
|
|
|
|
unique_listLabelTriples = list(dict.fromkeys(tuple(triple) for triple in listLabelTriples)) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if unique_listLabelTriples: |
|
|
if cache_map_virtuoso is not None: |
|
|
if not entityBioeUrl in cache_map_virtuoso: |
|
|
cache_map_virtuoso[entityBioeUrl] = {} |
|
|
cache_map_virtuoso[entityBioeUrl]["LabelTriples"] = unique_listLabelTriples |
|
|
|
|
|
singleContext, load_map_query_input_output = getLinearTextualContextFromTriples(word, unique_listLabelTriples, text_splitter, args, load_map_query_input_output,cleanInput=True,questionText=questionText) |
|
|
|
|
|
|
|
|
except Exception as err: |
|
|
singleContext = None |
|
|
|
|
|
if singleContext and (strtobool(args.UseRetrieverForContextCreation)==False): |
|
|
if cache_map_virtuoso is not None: |
|
|
if not entityBioeUrl in cache_map_virtuoso: |
|
|
cache_map_virtuoso[entityBioeUrl] = {} |
|
|
cache_map_virtuoso[entityBioeUrl]["SingleContext"] = singleContext |
|
|
|
|
|
|
|
|
if strtobool(args.computeEntityGlobalContext): |
|
|
|
|
|
if strtobool(args.debug): |
|
|
print("START computeEntityGlobalContext") |
|
|
|
|
|
unique_listGlobalTriples = [] |
|
|
globalContext = None |
|
|
|
|
|
if cache_map_virtuoso is not None: |
|
|
if word in cache_map_virtuoso: |
|
|
if ("GlobalTriples"+" "+contextWordVirtuoso).strip() in cache_map_virtuoso[word]: |
|
|
unique_listGlobalTriples = cache_map_virtuoso[word][("GlobalTriples"+" "+contextWordVirtuoso).strip()] |
|
|
if strtobool(args.debug): |
|
|
print("RETRIEVED CACHED RESULT FOR:\n", word, " => ", ("GlobalTriples"+" "+contextWordVirtuoso).strip(), "\n") |
|
|
if (("GlobalContext"+" "+contextWordVirtuoso).strip() in cache_map_virtuoso[word]) and (strtobool(args.UseRetrieverForContextCreation)==False): |
|
|
globalContext = cache_map_virtuoso[word][("GlobalContext"+" "+contextWordVirtuoso).strip()] |
|
|
if strtobool(args.debug): |
|
|
print("RETRIEVED CACHED RESULT FOR:\n", word, " => ", ("GlobalContext"+" "+contextWordVirtuoso).strip(), "\n") |
|
|
|
|
|
|
|
|
if not globalContext: |
|
|
|
|
|
BreakenBeforeAll = False |
|
|
if unique_listGlobalTriples: |
|
|
globalContext, load_map_query_input_output = getLinearTextualContextFromTriples(word, unique_listGlobalTriples, |
|
|
text_splitter, args, |
|
|
load_map_query_input_output,cleanInput=True,questionText=questionText) |
|
|
else: |
|
|
|
|
|
if not ALLURIScontext: |
|
|
if cache_map_virtuoso is not None: |
|
|
if word in cache_map_virtuoso: |
|
|
ALLURIScontext = list(cache_map_virtuoso[word].keys()) |
|
|
                                ALLURIScontext = [element for element in ALLURIScontext if element and ("GlobalTriples" not in element) and ("GlobalContext" not in element) and ("http" in element)]
|
|
|
|
|
if not ALLURIScontext: |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
try: |
|
|
entityBioeUrl, ALLURIScontext, cache_map_virtuoso = getUrlBioAndAllOtherBioConcepts(word, |
|
|
args, |
|
|
key_virtuoso, |
|
|
cache_map_virtuoso, |
|
|
endpoint, |
|
|
VirtuosoUsername, |
|
|
contextWordVirtuoso, |
|
|
UseBioportalForLinking=UseBioportalForLinking, |
|
|
questionText=questionText) |
|
|
if ALLURIScontext and isinstance(ALLURIScontext, list): |
|
|
ALLURIScontext = list(set(ALLURIScontext)) |
|
|
|
|
|
except Exception as err: |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return None, None, None, None, None, None, cache_map_virtuoso, load_map_query_input_output |
|
|
|
|
|
|
|
|
|
|
|
if not ALLURIScontext: |
|
|
|
|
|
print("THIS CASE SHOULD NEVER HAPPEN NOW!!!! Check what's happening...exiting now...") |
|
|
|
|
|
sys.exit(1) |
|
|
|
|
|
else: |
|
|
|
|
|
for xxUrl in ALLURIScontext: |
|
|
|
|
|
unique_listLabelTriples = [] |
|
|
|
|
|
|
|
|
if cache_map_virtuoso is not None: |
|
|
if xxUrl in cache_map_virtuoso: |
|
|
if "LabelTriples" in cache_map_virtuoso[xxUrl]: |
|
|
unique_listLabelTriples = cache_map_virtuoso[xxUrl]["LabelTriples"] |
|
|
if strtobool(args.debug): |
|
|
print("RETRIEVED CACHED RESULT FOR:\n", xxUrl, " => ", |
|
|
"LabelTriples", "\n") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if not unique_listLabelTriples: |
|
|
|
|
|
query = f""" |
|
|
prefix skosxl: <http://www.w3.org/2008/05/skos-xl#> |
|
|
SELECT DISTINCT ?labelS ?labelP ?labelO |
|
|
WHERE {{ |
|
|
{{ |
|
|
<{xxUrl}> ?p ?o. |
|
|
<{xxUrl}> skos:prefLabel|rdfs:label|skos:altLabel|skosxl:literalForm|obo:hasRelatedSynonym ?labelS . |
|
|
?p skos:prefLabel|rdfs:label|skos:altLabel|skosxl:literalForm|obo:hasRelatedSynonym ?labelP . |
|
|
?o skos:prefLabel|rdfs:label|skos:altLabel|skosxl:literalForm|obo:hasRelatedSynonym ?labelO . |
|
|
}} |
|
|
UNION |
|
|
{{ |
|
|
SELECT ?labelS ?labelP ?labelO |
|
|
WHERE {{ |
|
|
<{xxUrl}> ?p ?labelO . |
|
|
<{xxUrl}> skos:prefLabel|rdfs:label|skos:altLabel|skosxl:literalForm|obo:hasRelatedSynonym ?labelS . |
|
|
?p skos:prefLabel|rdfs:label|skos:altLabel|skosxl:literalForm|obo:hasRelatedSynonym ?labelP . |
|
|
FILTER (isLiteral(?labelO)) |
|
|
}} |
|
|
}} |
|
|
UNION |
|
|
{{ |
|
|
SELECT DISTINCT ?labelS ?labelP ?labelO |
|
|
WHERE {{ |
|
|
<{xxUrl}> ?ppp ?ooo . |
|
|
?ooo rdf:type owl:Restriction . |
|
|
?ooo owl:onProperty ?p . |
|
|
?ooo owl:someValuesFrom ?o . |
|
|
<{xxUrl}> skos:prefLabel|rdfs:label|skos:altLabel|skosxl:literalForm|obo:hasRelatedSynonym ?labelS . |
|
|
?p skos:prefLabel|rdfs:label|skos:altLabel|skosxl:literalForm|obo:hasRelatedSynonym ?labelP . |
|
|
?o skos:prefLabel|rdfs:label|skos:altLabel|skosxl:literalForm|obo:hasRelatedSynonym ?labelO . |
|
|
}} |
|
|
}} |
|
|
}} |
|
|
""" |
|
|
|
|
|
                try:
                    responseText = sparqlQuery(endpoint, query, VirtuosoUsername, key_virtuoso, strtobool(args.USE_CACHE))
                    results = json.loads(responseText)

                    if len(results) > 0 and results['results']['bindings']:

                        labelTriples = ""
                        listLabelTriples = []
                        # strip datatype suffixes such as ^^<http://www.w3.org/2001/XMLSchema#string>
                        pattern = r'\^\^<http:.*?>'
                        for result in results['results']['bindings']:
                            ss = str(result['labelS']['value']).strip().replace("..", ".").replace("@en", "")
                            ss = re.sub(pattern, '', ss)
                            pp = split_camel_case(str(result['labelP']['value'])).replace("_", " ").strip().replace("..", ".").replace("@en", "")
                            pp = re.sub(pattern, '', pp)
                            oo = str(result['labelO']['value']).replace("_", " ").strip().replace("..", ".").replace("@en", "")
                            oo = re.sub(pattern, '', oo)
                            listLabelTriples.append([ss, pp, oo])

                        # deduplicate while preserving the original order
                        unique_listLabelTriples = list(
                            dict.fromkeys(tuple(triple) for triple in listLabelTriples))

                        if unique_listLabelTriples:
                            if cache_map_virtuoso is not None:
                                if xxUrl not in cache_map_virtuoso:
                                    cache_map_virtuoso[xxUrl] = {}
                                cache_map_virtuoso[xxUrl]["LabelTriples"] = unique_listLabelTriples

                except Exception as err:
                    unique_listLabelTriples = []
            if unique_listLabelTriples:
                unique_listGlobalTriples.extend(unique_listLabelTriples)

            # cap the number of triples considered for the global-context computation
            if getattr(args, 'maxTriplesContextComputation', None):
                if args.maxTriplesContextComputation > 0:
                    if len(unique_listGlobalTriples) > args.maxTriplesContextComputation:
                        unique_listGlobalTriples = unique_listGlobalTriples[:args.maxTriplesContextComputation]
                        BreakenBeforeAll = True
                        break
    if unique_listGlobalTriples:

        unique_listGlobalTriples = list(
            dict.fromkeys(tuple(triple) for triple in unique_listGlobalTriples))

        if cache_map_virtuoso is not None:
            if word not in cache_map_virtuoso:
                cache_map_virtuoso[word] = {}
            # only cache the global triples when the list was not truncated above
            if not BreakenBeforeAll:
                cache_map_virtuoso[word][("GlobalTriples" + " " + contextWordVirtuoso).strip()] = unique_listGlobalTriples

        globalContext, load_map_query_input_output = getLinearTextualContextFromTriples(word,
                                                                                        unique_listGlobalTriples,
                                                                                        text_splitter, args,
                                                                                        load_map_query_input_output,
                                                                                        cleanInput=True,
                                                                                        questionText=questionText)

        if globalContext and (strtobool(args.UseRetrieverForContextCreation) == False):
            if cache_map_virtuoso is not None:
                if word not in cache_map_virtuoso:
                    cache_map_virtuoso[word] = {}
                if not BreakenBeforeAll:
                    cache_map_virtuoso[word][("GlobalContext" + " " + contextWordVirtuoso).strip()] = globalContext

    if unique_listLabelTriples:
        # flatten the per-URI label triples (as last computed) into a single delimited string
        sssingleTriples = " ,., ".join(
            " ,,, ".join(str(element).capitalize() for element in triple) for triple in unique_listLabelTriples)
        while "\\n" in sssingleTriples:
            sssingleTriples = sssingleTriples.replace("\\n", " ")
        sssingleTriples = sssingleTriples.strip()
        while "\t" in sssingleTriples:
            sssingleTriples = sssingleTriples.replace("\t", " ")
        sssingleTriples = sssingleTriples.strip()

    if unique_listGlobalTriples:
        # flatten the global triples into a single delimited string
        ggglobalTriples = " ,., ".join(
            " ,,, ".join(str(element).capitalize() for element in triple) for triple in unique_listGlobalTriples)
        while "\\n" in ggglobalTriples:
            ggglobalTriples = ggglobalTriples.replace("\\n", " ")
        ggglobalTriples = ggglobalTriples.strip()
        while "\t" in ggglobalTriples:
            ggglobalTriples = ggglobalTriples.replace("\t", " ")
        ggglobalTriples = ggglobalTriples.strip()

    return entityBioeUrl, ALLURIScontext, singleContext, globalContext, sssingleTriples, ggglobalTriples, cache_map_virtuoso, load_map_query_input_output
def process_row4Linking(row, text_splitter, args, key_geonames, cache_map_geonames, key_virtuoso, cache_map_virtuoso, load_map_query_input_output):
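    """
    Link one annotated row to the external knowledge bases.

    Rough intent, inferred from the branches below (not an authoritative spec):
    geographic entities (IsGeo == 1) go through the GeoNames API; biomedical
    entities (IsBio == 1), "Forced" rows and rows that already carry an NCBO
    @id go through the Virtuoso/BioPortal linking; in RAG mode only biomedical
    (or unresolved MISC) entities are linked. Returns the resolved entity URL,
    the candidate URIs, the single and global contexts and triples, the
    (possibly updated) caches and the row index, so the caller can write the
    results back into the dataframe.
    """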
    result = ""
    singleContext = ""
    globalContext = ""
    singleTriples = ""
    globalTriples = ""
    ALLURIScontext = []

    try:

        if row.empty:
            return result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_geonames, cache_map_virtuoso, load_map_query_input_output, row.name

        InRagMode = False
        if hasattr(args, 'useBioKgRAG') and strtobool(args.useBioKgRAG):
            InRagMode = True

        if not InRagMode:

            if row['IsGeo'] == 1:

                if strtobool(args.debug):
                    print(f"\n----- IsGeo ... COMPUTING {row['word']} IN THE TEXT:")
                    print(row[args.source_column])

                result, singleContext, globalContext, singleTriples, globalTriples, cache_map_geonames = geonames_api_call(row['word'], args, key_geonames, cache_map_geonames)
            elif row['IsBio'] == 1:

                # reuse the concept id and candidate URIs already provided by NCBO BioPortal, if any
                iiid = None
                if '@id' in row:
                    if row['@id'] is not None and not pd.isna(row['@id']):
                        iiid = row['@id']

                iiiALLURIScontextFromNCBO = None
                if 'ALLURIScontextFromNCBO' in row:
                    if row['ALLURIScontextFromNCBO'] is not None and isinstance(row['ALLURIScontextFromNCBO'], list):
                        iiiALLURIScontextFromNCBO = row['ALLURIScontextFromNCBO']
                        iiiALLURIScontextFromNCBO = list(set(iiiALLURIScontextFromNCBO))

                if strtobool(args.debug):
                    print(f"\n----- isBio COMPUTING ... {row['word']} IN THE TEXT:")
                    print(row[args.source_column])

                result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_virtuoso, load_map_query_input_output = virtuoso_api_call(row['word'], text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output, id=iiid, iALLURIScontextFromNCBO=iiiALLURIScontextFromNCBO, UseBioportalForLinking=True, questionText=row[args.source_column])
            else:

                if row['model'] == "Forced":

                    iiid = None
                    if '@id' in row:
                        if row['@id'] is not None and not pd.isna(row['@id']):
                            iiid = row['@id']

                    iiiALLURIScontextFromNCBO = None
                    if 'ALLURIScontextFromNCBO' in row:
                        if row['ALLURIScontextFromNCBO'] is not None and isinstance(row['ALLURIScontextFromNCBO'], list):
                            iiiALLURIScontextFromNCBO = row['ALLURIScontextFromNCBO']
                            iiiALLURIScontextFromNCBO = list(set(iiiALLURIScontextFromNCBO))

                    if strtobool(args.debug):
                        print(f"\n----- isForced COMPUTING ... {row['word']} IN THE TEXT:")
                        print(row[args.source_column])

                    result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_virtuoso, load_map_query_input_output = virtuoso_api_call(
                        row['word'], text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output,
                        id=iiid, iALLURIScontextFromNCBO=iiiALLURIScontextFromNCBO, UseBioportalForLinking=True, questionText=row[args.source_column])

                    # fall back to a direct Virtuoso lookup when BioPortal-based linking found nothing
                    if not result:
                        result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_virtuoso, load_map_query_input_output = virtuoso_api_call(
                            row['word'], text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output,
                            id=iiid, iALLURIScontextFromNCBO=iiiALLURIScontextFromNCBO, UseBioportalForLinking=False, questionText=row[args.source_column])
                else:

                    if '@id' in row:
                        if row['@id'] is not None and not pd.isna(row['@id']):
                            iiid = row['@id']

                            iiiALLURIScontextFromNCBO = None
                            if 'ALLURIScontextFromNCBO' in row:
                                if row['ALLURIScontextFromNCBO'] is not None and isinstance(row['ALLURIScontextFromNCBO'], list):
                                    iiiALLURIScontextFromNCBO = row['ALLURIScontextFromNCBO']
                                    iiiALLURIScontextFromNCBO = list(set(iiiALLURIScontextFromNCBO))

                            if strtobool(args.debug):
                                print(f"\n----- It is not IsBio or IsGeo, but it has id from NCBO ...forcing COMPUTING ... {row['word']} IN THE TEXT:")
                                print(row[args.source_column])

                            result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_virtuoso, load_map_query_input_output = virtuoso_api_call(
                                row['word'], text_splitter, args, key_virtuoso, cache_map_virtuoso,
                                load_map_query_input_output,
                                id=iiid, iALLURIScontextFromNCBO=iiiALLURIScontextFromNCBO, UseBioportalForLinking=True,
                                questionText=row[args.source_column])
        else:
            # RAG mode: only biomedical (or still unresolved MISC) entities are linked
            if (row['IsBio'] == 1) or ((pd.isnull(row["IsBio"]) or row["IsBio"] == '' or row['IsBio'] == 0 or row["IsBio"] is None) and (row['entity_group'] == "MISC")):

                if strtobool(args.debug):
                    print(f"\n----- InRagMode ...COMPUTING ... {row['word']} IN THE TEXT:")
                    print(row[args.source_column])

                iiid = None
                if '@id' in row:
                    if row['@id'] is not None and not pd.isna(row['@id']):
                        iiid = row['@id']

                iiiALLURIScontextFromNCBO = None
                if 'ALLURIScontextFromNCBO' in row:
                    if row['ALLURIScontextFromNCBO'] is not None and isinstance(row['ALLURIScontextFromNCBO'], list):
                        iiiALLURIScontextFromNCBO = row['ALLURIScontextFromNCBO']
                        iiiALLURIScontextFromNCBO = list(set(iiiALLURIScontextFromNCBO))

                result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_virtuoso, load_map_query_input_output = virtuoso_api_call(
                    row['word'], text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output, id=iiid, iALLURIScontextFromNCBO=iiiALLURIScontextFromNCBO, UseBioportalForLinking=True, questionText=row[args.source_column])

        return result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_geonames, cache_map_virtuoso, load_map_query_input_output, row.name
    except Exception as e:
        # on any error, return the (possibly partial) defaults so the caller can continue
        return result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_geonames, cache_map_virtuoso, load_map_query_input_output, row.name
def parallel_process_Row4Linking(df, text_splitter, args, key_geonames, cache_map_geonames, key_virtuoso, cache_map_virtuoso, load_map_query_input_output):
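    """
    Run process_row4Linking over every row of df in a thread pool and write the
    results back into df in place. As inferred from the code below: per-row
    cache copies that differ from the shared caches are merged back with
    update_nested_dict, and the updated df plus the merged caches are returned.
    """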
    results = []

    with ThreadPoolExecutor(max_workers=args.num_cores_eLinking) as executor:

        futures = [executor.submit(process_row4Linking, row, text_splitter, args, key_geonames, cache_map_geonames, key_virtuoso, cache_map_virtuoso, load_map_query_input_output)
                   for _, row in df.iterrows()]

        for future in as_completed(futures):
            try:
                result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_geonames_Inner, cache_map_virtuoso_Inner, load_map_query_input_output_Inner, drm_idx = future.result()

                df.at[drm_idx, 'namedEntity'] = result
                df.at[drm_idx, 'ALLURIScontext'] = ALLURIScontext
                df.at[drm_idx, 'Context'] = singleContext
                df.at[drm_idx, 'ContextGlobal'] = globalContext
                df.at[drm_idx, 'Triples'] = singleTriples
                df.at[drm_idx, 'TriplesGlobal'] = globalTriples

                # merge back any cache entries added by the worker
                if cache_map_geonames != cache_map_geonames_Inner:
                    update_nested_dict(cache_map_geonames, cache_map_geonames_Inner)
                if cache_map_virtuoso != cache_map_virtuoso_Inner:
                    update_nested_dict(cache_map_virtuoso, cache_map_virtuoso_Inner)
                if load_map_query_input_output != load_map_query_input_output_Inner:
                    update_nested_dict(load_map_query_input_output, load_map_query_input_output_Inner)

            except Exception as e:
                print(f"Error occurred: {e}")

    return df, cache_map_geonames, cache_map_virtuoso, load_map_query_input_output
def elinking(df_annotated_combined, text_splitter, args, key_geonames, cache_map_geonames, key_virtuoso, cache_map_virtuoso, load_map_query_input_output, device):
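    """
    Entity-linking driver: runs process_row4Linking over the annotated
    dataframe (in parallel when args.num_cores_eLinking > 1, otherwise row by
    row), fills empty ALLURIScontext lists from the linked entity, and returns
    the enriched dataframe together with the updated GeoNames, Virtuoso and
    LLM-query caches.
    """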
    if "ALLURIScontext" not in df_annotated_combined.columns:
        df_annotated_combined["ALLURIScontext"] = None

    if args.num_cores_eLinking > 1:
        # parallel entity linking
        df_annotated_combined, cache_map_geonames_AFTER, cache_map_virtuoso_AFTER, load_map_query_input_output_AFTER = parallel_process_Row4Linking(df_annotated_combined, text_splitter, args, key_geonames, cache_map_geonames, key_virtuoso, cache_map_virtuoso, load_map_query_input_output)
    else:
        # sequential entity linking
        result = df_annotated_combined.apply(lambda row: process_row4Linking(row, text_splitter, args, key_geonames, cache_map_geonames, key_virtuoso, cache_map_virtuoso, load_map_query_input_output), axis=1)

        try:
            df_annotated_combined['namedEntity'] = result.str[0]
            df_annotated_combined['ALLURIScontext'] = result.str[1]
            df_annotated_combined['Context'] = result.str[2]
            df_annotated_combined['ContextGlobal'] = result.str[3]
            df_annotated_combined['Triples'] = result.str[4]
            df_annotated_combined['TriplesGlobal'] = result.str[5]
            # the caches are mutated in place, so the copies returned by the last row hold all updates
            cache_map_geonames_AFTER = result.str[6].iloc[-1]
            cache_map_virtuoso_AFTER = result.str[7].iloc[-1]
            load_map_query_input_output_AFTER = result.str[8].iloc[-1]
        except Exception as e:
            df_annotated_combined['namedEntity'] = ""
            df_annotated_combined['ALLURIScontext'] = ""
            df_annotated_combined['Context'] = ""
            df_annotated_combined['ContextGlobal'] = ""
            df_annotated_combined['Triples'] = ""
            df_annotated_combined['TriplesGlobal'] = ""
            cache_map_geonames_AFTER = cache_map_geonames
            cache_map_virtuoso_AFTER = cache_map_virtuoso
            load_map_query_input_output_AFTER = load_map_query_input_output

    def fill_alluriscontext(row):
        # if no candidate URIs were found but an entity was linked,
        # use the linked entity itself as the only candidate
        if not row['ALLURIScontext'] and pd.notnull(row['namedEntity']):
            return [row['namedEntity']]
        return row['ALLURIScontext']

    df_annotated_combined['ALLURIScontext'] = df_annotated_combined.apply(fill_alluriscontext, axis=1)

    if args.num_cores_eLinking > 1:
        # parallel execution may complete rows out of order; restore a stable ordering
        df_annotated_combined = df_annotated_combined.sort_values(by=['SentenceRef', 'start', 'ToLink', 'word', 'score'],
                                                                  ascending=[True, True, True, True, False])

    return df_annotated_combined, cache_map_geonames_AFTER, cache_map_virtuoso_AFTER, load_map_query_input_output_AFTER
if __name__ == '__main__':

    start = time.time()

    # token-based splitter used to chunk long inputs before querying the LLM
    encod = encoding_getter('microsoft/deberta-v3-large')
    text_splitter = TokenTextSplitter(
        encoding_name=encod.name,
        chunk_size=80000,
        chunk_overlap=50,
        length_function=len,
        add_start_index=True,
    )
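
    # NOTE: chunk_size is expressed in tokens of the chosen encoding and appears to be
    # kept in line with the --tokens_max default (80000) defined below, so that a single
    # chunk fits in one LLM call; chunk_overlap keeps a small overlap between chunks.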

    models_List = ["NCBO/BioPortal"]
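    # Only the NCBO BioPortal annotator is run here; other HuggingFace or GLiNER
    # model ids could be added to this list and would be picked up by the
    # corresponding branches in the loop below.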

    df_annotated = pd.DataFrame()
    for model_id in models_List:

        parser = argparse.ArgumentParser()

        parser.add_argument("--model_id", type=str, default=model_id, help="model to use")
        parser.add_argument("--debug", type=str, default="True", help="set debug mode")
        parser.add_argument("--input_dir", type=str,
                            default="/eos/jeodpp/home/users/consose/PycharmProjects/ner-virtuoso/")
        parser.add_argument("--filename", type=str,
                            default="ToAnnotateTrial2.csv")
        parser.add_argument("--source_column", type=str, default="ContextToAnnotate")

        parser.add_argument("--entities_filter_threshold", type=float, default=0.7)

        parser.add_argument("--SEED", type=int, default=41)
        parser.add_argument("--batch_size", type=int, default=4)
        parser.add_argument("--num_cores_Gliner", type=int, default=0, help="parallel processing for Gliner annotation")

        parser.add_argument("--entity_linking", type=str, default="True", help="whether to perform entity linking or not")
        parser.add_argument("--geonameskey_filename", type=str, default="GEONAMES-API.key", help="file where the geonames api key is stored")
        parser.add_argument("--virtuosokey_filename", type=str, default="VIRTUOSO-dba.key", help="file where the virtuoso endpoint dba password is stored")
        parser.add_argument("--bioportalkey_filename", type=str, default="NCBO-BioPortal.key", help="file where the NCBO BioPortal api key is stored")
        KGchoices = POSSIBLE_KGchoices_List

        if KGchoices:
            KGchoices.sort()
            parser.add_argument("--KG_restriction", nargs='+', choices=KGchoices, default=KGchoices,
                                help="List of ontologies to which the entity linking task is restricted.")

        # caching is enabled by default only when linking against the full set of KGs,
        # so that cached results stay consistent with the KG restriction in use
        if Counter(KGchoices) == Counter(POSSIBLE_KGchoices_List):
            parser.add_argument("--USE_CACHE", type=str, default="True",
                                help="whether to use cache for the NER and NEL tasks or not")
        else:
            parser.add_argument("--USE_CACHE", type=str, default="False",
                                help="whether to use cache for the NER and NEL tasks or not")

        parser.add_argument("--num_cores_eLinking", type=int, default=4, help="parallel processing for the entity linking process")
        parser.add_argument("--computeEntityContext", type=str, default="False", help="whether to extract a readable context from the extracted triples for the concept")
        parser.add_argument("--computeEntityGlobalContext", type=str, default="False", help="whether to extract a readable context from the triples of all the entities extracted from the endpoint for the concept")
        parser.add_argument("--maxTriplesContextComputation", type=int, default=20000,
                            help="maximum number of triples to consider for global context computation")
        parser.add_argument("--UseRetrieverForContextCreation", type=str, default="True",
                            help="whether to use a retriever to build the entity contexts from the triples coming from the KGs")

        parser.add_argument("--service_provider", type=str, default="gptjrc", help="llm service provider")
        parser.add_argument("--model_name", type=str, default="llama-3.1-70b-instruct-fp8", help="llm to use")
        parser.add_argument("--tokens_max", type=int, default=80000, help="max number of tokens to supply to the llm")
        parser.add_argument("--max_new_tokens", type=int, default=4096, help="max number of new tokens the llm is allowed to generate")
        parser.add_argument("--temperature", type=float, default=0.01)

        args = parser.parse_args()

        print("ARGS:")
        print(args)
        # fix all random seeds for reproducibility
        random.seed(args.SEED)
        np.random.seed(args.SEED)
        torch.manual_seed(args.SEED)
        torch.cuda.manual_seed_all(args.SEED)
        in_filename = args.input_dir + args.filename
        df_ToAnnotate = pd.read_csv(in_filename, sep=',', header=0, dtype=str, encoding='utf-8')

        if "ToLink" not in df_ToAnnotate.columns:
            df_ToAnnotate["ToLink"] = None

        if "SentenceRef" not in df_ToAnnotate.columns:
            df_ToAnnotate["SentenceRef"] = None
            # move SentenceRef to the first column
            df_ToAnnotate = df_ToAnnotate[['SentenceRef'] + [col for col in df_ToAnnotate.columns if col != 'SentenceRef']]
        df_ToAnnotate['SentenceRef'] = df_ToAnnotate.index + 1
        df_ToAnnotate['SentenceRef'] = df_ToAnnotate['SentenceRef'].argsort().groupby(df_ToAnnotate[args.source_column]).transform('min').astype(int)
        df_ToAnnotate['SentenceRef'] = df_ToAnnotate['SentenceRef'].rank(method='dense').astype(int)
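
        # The three assignments above give every row a SentenceRef such that all rows
        # sharing the same source text receive the same reference, and the references
        # are then compacted into consecutive integers (1, 2, 3, ...) via a dense rank.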

        tsk = "token-classification"

        pipe = None
        tokenizerGliner = None
        modelGliner = None
        modelGlinerBio = None

        if ("/gliner" not in args.model_id) and ("NCBO" not in args.model_id):
            # plain HuggingFace token-classification model
            pipe = pipeline(
                tsk,
                model=args.model_id,
                aggregation_strategy="simple",
                device=device,
            )
        elif "/gliner" in args.model_id:
            if not tokenizerGliner:
                tokenizerGliner = AutoTokenizer.from_pretrained('microsoft/deberta-v3-large')
            if "_bio-" in args.model_id:
                # biomedical GLiNER variant
                if args.num_cores_Gliner > 0:
                    modelGlinerBio = GLiNER.from_pretrained(args.model_id)
                else:
                    modelGlinerBio = GLiNER.from_pretrained(args.model_id, map_location=device)
            else:
                if args.num_cores_Gliner > 0:
                    modelGliner = GLiNER.from_pretrained(args.model_id)
                else:
                    modelGliner = GLiNER.from_pretrained(args.model_id, map_location=device)
        new_annotations = annotate(df_ToAnnotate, args, pipe, tokenizerGliner, modelGliner, modelGlinerBio, device)

        if not new_annotations.empty:
            if df_annotated.empty:
                df_annotated = new_annotations
            else:
                df_annotated = pd.concat([df_annotated, new_annotations], ignore_index=True)

    if not df_annotated.empty:
        # merge overlapping annotations coming from the different models
        df_annotated_combined = entitiesFusion(df_annotated, args)
        if strtobool(args.debug):
            print("\nStart is_cross_inside function ...")
        df_annotated_combined = is_cross_inside(df_annotated_combined, args, 0.75)
    else:
        df_annotated_combined = df_annotated
    if args.service_provider == "gptjrc":
        key_gptjrc = ""
        fkeyname = "GPTJRC-APItoken.key"
        if os.path.exists(fkeyname):
            with open(fkeyname) as f:
                key_gptjrc = f.read()
        else:
            key_gptjrc = os.environ['key_gptjrc']
        if key_gptjrc and key_gptjrc != "":
            setup_gptjrc(key_gptjrc)
    # cache of LLM query/answer pairs, keyed by provider, model and temperature
    cache_prefix_fp = "LLMQUERYNER"
    cache_nameLLMs = cache_prefix_fp + "___" + "__".join(
        [args.service_provider, args.model_name, str(args.temperature)]).replace(" ", "_") + ".json"

    load_map_query_input_output = None
    if strtobool(args.USE_CACHE):
        if os.path.exists(cache_nameLLMs):
            with open(cache_nameLLMs) as f:
                load_map_query_input_output = json.load(f)
        else:
            load_map_query_input_output = {}
    if strtobool(args.entity_linking):

        cache_map_geonames = None
        if strtobool(args.USE_CACHE):
            cacheGeonames_filename = "CACHE_geonames.json"
            if os.path.exists(cacheGeonames_filename):
                with open(cacheGeonames_filename) as f:
                    cache_map_geonames = json.load(f)
            else:
                cache_map_geonames = {}

        key_geonames = ""
        if args.geonameskey_filename and os.path.exists(args.geonameskey_filename):
            fkeyname = args.geonameskey_filename
            with open(fkeyname) as f:
                key_geonames = f.read()
        else:
            key_geonames = os.environ['key_geonames']

        cache_map_virtuoso = None
        if strtobool(args.USE_CACHE):
            cacheVirtuoso_filename = "CACHE_virtuoso.json"
            if os.path.exists(cacheVirtuoso_filename):
                with open(cacheVirtuoso_filename) as f:
                    cache_map_virtuoso = json.load(f)
            else:
                cache_map_virtuoso = {}

        key_virtuoso = ""
        if args.virtuosokey_filename and os.path.exists(args.virtuosokey_filename):
            fkeyname = args.virtuosokey_filename
            with open(fkeyname) as f:
                key_virtuoso = f.read()
        else:
            key_virtuoso = os.environ['key_virtuoso']
        # sentences that produced no annotation at all
        missing_sentence_refs = ~df_ToAnnotate['SentenceRef'].isin(df_annotated_combined['SentenceRef'])

        # sentences whose whole text is wrapped in double quotes
        quoted_context = df_ToAnnotate[args.source_column].str.startswith('"') & df_ToAnnotate[args.source_column].str.endswith('"')

        condition = missing_sentence_refs & quoted_context
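
        # Rows matching this condition carry text that was wrapped in double quotes in the
        # input but that no NER model annotated; they are added below as "Forced"
        # pseudo-annotations spanning the whole (unquoted) text, so that they still go
        # through entity linking.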

        rows_to_add = df_ToAnnotate[condition].copy()

        rows_to_add['model'] = "Forced"
        rows_to_add['entity_group'] = "MISC"
        rows_to_add['word'] = rows_to_add[args.source_column].apply(strip_quotes)
        rows_to_add['score'] = 1.0
        rows_to_add['start'] = 1
        rows_to_add['end'] = rows_to_add['word'].apply(len) + 1
        rows_to_add['IsGeo'] = None
        rows_to_add['IsBio'] = None
        rows_to_add['IsCrossInside'] = 0.0

        df_annotated_combined = pd.concat([df_annotated_combined, rows_to_add], ignore_index=True)

        df_annotated_combined = df_annotated_combined.sort_values(
            by=['SentenceRef', 'start', 'ToLink', 'word', 'score'], ascending=[True, True, True, True, False])
        df_annotated_combined, cache_map_geonames_AFTER, cache_map_virtuoso_AFTER, load_map_query_input_output_AFTER = elinking(df_annotated_combined, text_splitter, args, key_geonames, cache_map_geonames, key_virtuoso, cache_map_virtuoso, load_map_query_input_output, device)

        # persist the updated caches to disk
        if strtobool(args.USE_CACHE):
            if cache_map_geonames_AFTER is not None:
                with open(cacheGeonames_filename, "w") as f:
                    json.dump(cache_map_geonames_AFTER, f)

            if cache_map_virtuoso_AFTER is not None:
                with open(cacheVirtuoso_filename, "w") as f:
                    json.dump(cache_map_virtuoso_AFTER, f)

            if load_map_query_input_output_AFTER is not None:
                with open(cache_nameLLMs, "w") as f:
                    json.dump(load_map_query_input_output_AFTER, f)
        all_emptyTriples = None
        all_emptyGlobalTriples = None
        if 'Triples' in df_annotated_combined.columns:
            all_emptyTriples = df_annotated_combined['Triples'].apply(lambda x: pd.isnull(x) or x == '' or x is None).all()

        if 'TriplesGlobal' in df_annotated_combined.columns:
            all_emptyGlobalTriples = df_annotated_combined['TriplesGlobal'].apply(lambda x: pd.isnull(x) or x == '' or x is None).all()

        if '@id' in df_annotated_combined.columns:
            df_annotated_combined = df_annotated_combined.rename(columns={'@id': '@idfromNCBO'})

        if 'ALLURIScontextFromNCBO' in df_annotated_combined.columns:
            df_annotated_combined.drop('ALLURIScontextFromNCBO', axis=1, inplace=True)

        # when at least one triple was extracted, first write an extended output that includes
        # the triples, then drop the triple columns before writing the standard output
        if (all_emptyTriples == False) or (all_emptyGlobalTriples == False):
            output_texts_filename = args.input_dir + args.filename.replace(".csv", "_OutputAnnotated-withTriples.csv")
            df_annotated_combined.to_csv(output_texts_filename, sep=',', header=True, index=False, encoding='utf-8')
            df_annotated_combined.drop('Triples', axis=1, inplace=True)
            df_annotated_combined.drop('TriplesGlobal', axis=1, inplace=True)

    output_texts_filename = args.input_dir + args.filename.replace(".csv", "_OutputAnnotated.csv")
    df_annotated_combined.to_csv(output_texts_filename, sep=',', header=True, index=False, encoding='utf-8')
    print("\nEnd script")

    end = time.time()
    hours, rem = divmod(end - start, 3600)
    minutes, seconds = divmod(rem, 60)
    print("Overall Computational Time... {:0>2}:{:0>2}:{:05.2f}\n".format(int(hours), int(minutes), seconds))