Source code for factfinder.src.text_classifier_topics

import pandas as pd
from transformers import pipeline


class TextClassifierTopics:
    """
    Classify input texts into themes (structured types of events).

    The classifier is a Huggingface transformer model trained on rubert-tiny.
    For many themes the number of messages was too low to train on
    efficiently, so synthetic themes derived from the upper-level category
    were used instead (for example, 'unknown_ЖКХ').
    """

    def __init__(
        self,
        repository_id="Sandrro/text_to_subfunction_v10",
        number_of_categories=1,
        device_type=None,
    ):
        """
        Build the text-classification pipeline.

        :param repository_id: model identifier passed to the pipeline as ``model``
        :param number_of_categories: how many top predictions ``run`` requests
        :param device_type: forwarded to the pipeline as ``device``
        """
        self.CATS_NUM = number_of_categories
        self.REP_ID = repository_id
        self.classifier = pipeline(
            "text-classification",
            model=repository_id,
            tokenizer="cointegrated/rubert-tiny2",
            max_length=2048,
            truncation=True,
            device=device_type,
        )

    def run(self, t):
        """
        Predict themes and their probabilities for a text.

        :param t: text to classify
        :return: two-element list ``[themes, probabilities]``; when more than
            one category is requested each element is a "; "-joined string,
            otherwise each element is the single top value
        """
        top_n = self.CATS_NUM
        predictions = pd.DataFrame(self.classifier(t, top_k=top_n))
        # NOTE(review): presumably resets the pipeline's internal call counter
        # so its sequential-usage warning is not triggered - confirm against
        # the installed transformers version.
        self.classifier.call_count = 0

        labels = predictions["label"]
        # Scores are rounded to 3 decimals and rendered as strings.
        score_strings = predictions["score"].round(3).astype(str)

        if not top_n > 1:
            # Single category requested: return the bare top label and score.
            return [labels.iloc[0], score_strings.iloc[0]]

        joined_labels = "; ".join(labels.tolist())
        joined_scores = "; ".join(score_strings.tolist())
        return [joined_labels, joined_scores]