Source code for tidyX.spacy_preprocessor

import tqdm
import spacy
from typing import List, Union, Tuple
from spacy.lang.es import Spanish
from spacy.language import Language
from .text_preprocessor import TextPreprocessor 
import warnings
import spacy_spanish_lemmatizer

class SpacyPreprocessor:

    def __init__(self):
        pass
    @staticmethod
    def spanish_lemmatizer(token: str, model: Spanish) -> str:
        """Lemmatizes a given token using spaCy's Spanish language model.

        Note:
            Before using this function, a spaCy model for Spanish must be
            downloaded with `python -m spacy download name_of_model`.
            Available models: "es_core_news_sm", "es_core_news_md",
            "es_core_news_lg", "es_dep_news_trf".
            For more information, visit https://spacy.io/models/es

        Args:
            token (str): The token to be lemmatized.
            model (spacy.lang.es.Spanish): A spaCy language model object.

        Returns:
            str: The lemmatized version of the token, with accents removed.
        """
        if not token:
            return token
        try:
            lemma = model(token)[0].lemma_
            lemma = TextPreprocessor.remove_accents(lemma)
            return lemma
        except Exception as e:
            print(f"An error occurred: {e}")
            return token
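    # Minimal usage sketch for `spanish_lemmatizer` (an illustration, assuming
    # the "es_core_news_sm" model has already been downloaded; the exact lemma
    # depends on the model):
    #
    #   import spacy
    #   nlp = spacy.load("es_core_news_sm")
    #   SpacyPreprocessor.spanish_lemmatizer("gatos", nlp)   # expected -> "gato"
    #   SpacyPreprocessor.spanish_lemmatizer("", nlp)        # -> "" (empty tokens pass through)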
    @staticmethod
    def is_component_registered(name: str) -> bool:
        """Check if a spaCy pipeline component with the given name is already registered.

        Args:
            name (str): The name of the spaCy pipeline component.

        Returns:
            bool: True if the component is already registered, False otherwise.
        """
        return name in Language.factories
    @staticmethod
    def register_component():
        """Conditionally register the custom_lemmatizer component."""
        if not SpacyPreprocessor.is_component_registered("custom_lemmatizer"):

            @Language.factory("custom_lemmatizer")
            def custom_lemmatizer(nlp: Spanish, name: str):
                """Creates and returns a Spanish rule-based lemmatizer for spaCy.

                This factory function leverages the `spacy-spanish-lemmatizer`
                package to provide a rule-based lemmatizer for Spanish,
                enhancing the spaCy pipeline's ability to extract the base or
                dictionary form of a word, which is crucial for NLP tasks such
                as text normalization, text analysis, and information
                retrieval.

                For additional details on the lemmatizer, refer to:
                https://github.com/pablodms/spacy-spanish-lemmatizer

                Args:
                    nlp (spacy.lang.es.Spanish): The spaCy language model object.
                    name (str): The name under which spaCy registers the component.

                Returns:
                    The Spanish rule-based lemmatizer component.

                Example:
                    >>> import spacy
                    >>> nlp = spacy.load('es_core_news_sm')
                    >>> nlp.add_pipe('custom_lemmatizer', name='lemmatizer')
                    >>> doc = nlp("El gato está en la casa")
                    >>> [token.lemma_ for token in doc]
                    ['El', 'gato', 'estar', 'en', 'el', 'casa']
                """
                return spacy_spanish_lemmatizer.main.create_spanish_lemmatizer(nlp, name)
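    # Sketch of the guarded registration above (assumes spacy-spanish-lemmatizer
    # is installed): a second call is a no-op because the factory name is
    # already present in Language.factories, which avoids duplicate-factory errors.
    #
    #   SpacyPreprocessor.register_component()
    #   SpacyPreprocessor.is_component_registered("custom_lemmatizer")  # -> True
    #   SpacyPreprocessor.register_component()  # no-op on repeated calls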
    @staticmethod
    def spacy_pipeline(documents: List[str],
                       custom_lemmatizer: bool = False,
                       pipeline: List[str] = ['tokenize', 'lemmatizer'],
                       stopwords_language: str = 'Spanish',
                       model: str = 'es_core_news_sm',
                       num_strings: int = 0) -> Union[List[List[str]], Tuple[List[List[str]], List[Tuple[str, int]]]]:
        """Processes documents through the spaCy pipeline, performing
        lemmatization and stopword removal, and optionally utilizing a custom
        rule-based lemmatizer for Spanish.

        For further information on the custom lemmatizer, refer to:
        https://github.com/pablodms/spacy-spanish-lemmatizer

        Note:
            Ensure the relevant spaCy model is downloaded using:

            ```sh
            python -m spacy download <model_name>
            ```

            where <model_name> can be "es_core_news_sm", "es_core_news_md",
            "es_core_news_lg", or "es_dep_news_trf".

        Args:
            documents (List[str]): A list of texts to be processed.
            custom_lemmatizer (bool, optional): If True, a custom Spanish
                rule-based lemmatizer is added to the pipeline. Defaults to False.
            pipeline (List[str], optional): spaCy pipeline components kept
                enabled while processing. Defaults to ['tokenize', 'lemmatizer'].
            stopwords_language (str, optional): Language of the nltk stopwords
                list. Defaults to 'Spanish'.
            model (str, optional): spaCy model to be used. Defaults to
                'es_core_news_sm'.
            num_strings (int, optional): Number of most common strings to
                return. If 0, only the processed documents are returned.
                Defaults to 0.

        Returns:
            Union[List[List[str]], Tuple[List[List[str]], List[Tuple[str, int]]]]:
                The processed documents and, if num_strings > 0, a list of the
                most common strings in the documents.

        Raises:
            ValueError: If the documents list is empty.
        """
        import nltk
        from nltk.corpus import stopwords

        if not documents:
            raise ValueError("The documents list must not be empty.")

        try:
            # Load the model once, then disable every component it provides
            # that was not requested in `pipeline`.
            nlp = spacy.load(model)
            for comp in [c for c in nlp.pipe_names if c not in pipeline]:
                nlp.disable_pipe(comp)
        except Exception:
            # Fallback: disable components from a default list; adjust this
            # list based on the components of the specific model you're using.
            default_pipeline = ['tagger', 'parser', 'ner', 'lemmatizer']
            nlp = spacy.load(model, disable=[comp for comp in default_pipeline if comp not in pipeline])

        # Download resources; NLTK stopword corpora use lowercase names.
        nltk.download('stopwords')
        spanish_stopwords = stopwords.words(stopwords_language.lower())

        if custom_lemmatizer:
            custom_lemmatizer_name = "custom_lemmatizer"
            if 'lemmatizer' in nlp.pipe_names:
                nlp.replace_pipe("lemmatizer", custom_lemmatizer_name)
            else:
                nlp.add_pipe(custom_lemmatizer_name, name=custom_lemmatizer_name, last=True)

        processed_documents = [
            [
                TextPreprocessor.remove_accents(token.lemma_)
                for token in nlp(doc)
                if token.text not in spanish_stopwords
                and token.lemma_ not in spanish_stopwords
            ]
            for doc in tqdm.tqdm(documents, total=len(documents))
        ]

        if num_strings > 0:
            most_common_words = TextPreprocessor.get_most_common_strings(processed_documents, num_strings)
            return processed_documents, most_common_words

        return processed_documents
SpacyPreprocessor.register_component()
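
# A minimal end-to-end sketch of `spacy_pipeline` (an illustration, assuming
# "es_core_news_sm" was downloaded via `python -m spacy download es_core_news_sm`
# and that nltk can fetch its stopword corpus; the sample sentences are
# hypothetical):
if __name__ == "__main__":
    docs = [
        "Los gatos están en las casas",
        "El perro corre por el parque",
    ]
    # With num_strings > 0 the function also returns the most common strings.
    lemmas, common = SpacyPreprocessor.spacy_pipeline(
        documents=docs,
        custom_lemmatizer=False,  # set True to swap in the rule-based lemmatizer
        model="es_core_news_sm",
        num_strings=3,
    )
    print(lemmas)
    print(common)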