I'm trying use text normalization, and prepend the prefix NOT_ to every word after a token of logical negation, specifically for the sentence "didn't like this movie , but I."
By doing this, it creates the output:
"didn't NOT_like NOT_this NOT_movie , but I"
however I'm getting the output:
"didnt like NOT_didnt NOT_like this NOT_like NOT_this NOT_didnt NOT_like NOT_this movie NOT_this NOT_movie NOT_like NOT_this NOT_movie but NOT_movie NOT_but NOT_this NOT_movie NOT_but i NOT_but NOT_i NOT_movie NOT_but NOT_i".
I feel like I must be doing something obviously wrong--I'll past my code in below. Not being able to figure out what I'm doing wrong is driving me crazy.
def preprocess_text(text): text = text.lower() text = re.sub(r'[^\w\s\']', '', text) return text text = text.lower() text = re.sub(r'[^\w\s\']', '', text) return texttext = "didn’t like this movie, but I"def negify(text): negation_terms = {"not", "n't", "no"} punctuation_marks = {".", ",", "!", "?", ";"} words = text.split() negated_text = [] negation_active = False result = [] for word in words: word_stripped = word.strip('.,?!:;').lower() if negation_active: negated_text.append("not_" + word_stripped) else: negated_text.append(word_stripped) result.append(word_stripped) if len(negated_text) > 1: bigram = " ".join(negated_text[-2:]) result.append(bigram) if len(negated_text) > 2: trigram = " ".join(negated_text[-3:]) result.append(trigram) if any(neg in word.lower() for neg in negation_terms): negation_active = not negation_active if any(p in word for p in punctuation_marks): negation_active = False return " ".join(result)def binarize(text): words = text.split() binarized_words = [] negation_active = False for word in words: if word.lower() == "didn't": binarized_words.append(word) words = text.split() unique_words = set() binarized_words = [] for word in words: if word.lower() not in unique_words: unique_words.add(word.lower()) binarized_words.append(word) else: binarized_words.append('NOT_'+ word) binarized_text = ''.join(binarized_words) return binarized_texttext = preprocess_text(text)text = negify(text)binarized_output = binarize(text)print(binarized_output)``didnt like NOT_didnt NOT_like this NOT_like NOT_this NOT_didnt NOT_like NOT_this movie NOT_this NOT_movie NOT_like NOT_this NOT_movie but NOT_movie NOT_but NOT_this NOT_movie NOT_but i NOT_but NOT_i NOT_movie NOT_but NOT_i