import json
import os

import pandas as pd
import torch
from fastapi import FastAPI
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    set_seed,
)

app = FastAPI()

DATA_DIR = r"./bank_intenet_model"  # directory name kept as-is to match the on-disk folder
model_name_or_path = os.path.join(DATA_DIR, "model")
ar_id_to_label_file = os.path.join(DATA_DIR, "id_to_label_ar.json")
en_id_to_label_file = os.path.join(DATA_DIR, "id_to_label_en.json")
id_to_label_files = {
    "ar": ar_id_to_label_file,
    "en": en_id_to_label_file,
}
seed = 42
max_length = 128
use_slow_tokenizer = True
pad_to_max_length = True

set_seed(seed)  # apply the configured seed for reproducibility


def load_id_to_label(lang):
    """
    Loads a JSON file mapping integer IDs to string labels for the selected language.

    Args:
    - lang (str): the selected language ("ar" or "en")

    Returns:
    - id_to_label (dict): a dictionary with integer keys and string label values
      for the selected language
    """
    json_file_path = id_to_label_files[lang]

    with open(json_file_path, "r", encoding="utf-8") as f:
        content_dict = json.load(f)
    # JSON object keys are always strings, so cast them back to int
    return {int(key): value for key, value in content_dict.items()}
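
# Illustrative contents of an id_to_label file (the label names here are
# hypothetical, not taken from the actual model):
#   {"0": "balance_inquiry", "1": "card_activation", "2": "transfer_money"}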



def predict_texts(texts, lang):
    """
    Makes predictions on a list of texts using the loaded Hugging Face model.

    Args:
    - texts (list): a list of texts to make predictions on
    - lang (str): the language used to map predicted IDs to labels

    Returns:
    - output_dict (dict): a dictionary containing the texts, the predicted IDs
      and labels, and the prediction probabilities
    """
    padding = "max_length" if pad_to_max_length else False
    # return_tensors="pt" already yields torch tensors, so the encodings can be
    # passed straight to the model without any extra conversion
    tokenized_texts = tokenizer(
        texts,
        padding=padding,
        truncation=True,
        add_special_tokens=True,
        max_length=max_length,
        return_tensors="pt",
    )

    model.eval()
    with torch.no_grad():
        outputs = model(**tokenized_texts)

    logits = outputs.logits
    predictions = logits.argmax(dim=-1)
    softmax_outputs = torch.nn.functional.softmax(logits, dim=-1)
    all_predictions = predictions.cpu().numpy().tolist()
    all_probs = softmax_outputs.cpu().numpy().tolist()

    id_to_label = load_id_to_label(lang)
    labeled_predictions = [id_to_label[pred] for pred in all_predictions]
    # Keep only the probability of the predicted class, rounded to 3 decimals
    top_probs = [round(max(prob), 3) for prob in all_probs]
    df = pd.DataFrame({
        "text": texts,
        "predicted_ids": all_predictions,
        "predicted_label": labeled_predictions,
        "prob_value": top_probs,
    })

    return df.to_dict()
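
# Illustrative shape of the value predict_texts returns (pandas' default
# to_dict orientation; the values shown are hypothetical):
#   {"text": {0: "..."}, "predicted_ids": {0: 3},
#    "predicted_label": {0: "transfer_money"}, "prob_value": {0: 0.973}}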
    
def remove_empty_values(sentences):
    return [value for value in sentences if value != '']


def sent_tokenize(text, dot=True, new_line=True, question_mark=True, exclamation_mark=True):
    """Splits text into sentences on the enabled separators (handles the Arabic question mark too)."""
    separators = []
    split_text = [text]
    if new_line:
        separators.append('\n')
    if dot:
        separators.append('. ')
    if question_mark:
        separators.append('?')
        separators.append('؟')  # Arabic question mark
    if exclamation_mark:
        separators.append('!')

    for sep in separators:
        new_split_text = []
        for part in split_text:
            tokens = part.split(sep)
            # Re-attach the separator to every token except the last one
            tokens_with_separator = [token + sep for token in tokens[:-1]]
            tokens_with_separator.append(tokens[-1].strip())
            new_split_text.extend(tokens_with_separator)
        split_text = new_split_text

    return remove_empty_values(split_text)
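
# Example (illustrative):
#   sent_tokenize("Hello. How are you?\nFine!")
#   -> ['Hello.', 'How are you?', 'Fine!']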


config = AutoConfig.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=not use_slow_tokenizer)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name_or_path,
    from_tf=bool(".ckpt" in model_name_or_path),
    config=config,
)
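
# Optional sketch (an assumption, not part of the original service): the model
# could be moved to a GPU when one is available; the encodings built in
# predict_texts would then need a matching .to(device) call.
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)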


class BankRequest(BaseModel):
    lang: str
    text: str


@app.post("/predict")
def predict(request: BankRequest):
    lang = request.lang
    text = request.text

    # Split the input into sentences and classify them all in a single batch
    sentences = sent_tokenize(text, dot=True, new_line=True, question_mark=True, exclamation_mark=True)
    results = [predict_texts(sentences, lang)]

    content = {"resp": results, "statusText": "OK", "statusCode": 0}

    return JSONResponse(
        content=content,
        media_type="application/json",
        status_code=200,
    )
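
# Example request (illustrative values):
#   curl -X POST http://localhost:8000/predict \
#        -H "Content-Type: application/json" \
#        -d '{"lang": "en", "text": "I want to check my balance."}'

# Minimal sketch for running the service locally; assumes uvicorn is installed,
# and the host/port values are illustrative.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)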