Python에서 한글 텍스트 전처리를 수행하는 함수 모음
import re
from collections import Counter
# Punctuation/control characters that preprocessing() hands to string_cleanup()
# to be replaced by spaces. Kept as a list of single-character strings.
special_chars = list('\n?.+~-_,!@#$%^&*(){}[]/=`|')
def string_cleanup(x: str, notwanted) -> str:
    """Replace every occurrence of each unwanted substring with one space.

    Args:
        x: Text to clean.
        notwanted: Iterable of literal substrings (not regex patterns) to
            blank out. Empty strings in the iterable are ignored.

    Returns:
        A copy of ``x`` in which each occurrence of each item has been
        replaced by a single space character.
    """
    for item in notwanted:
        # ''.replace semantics: x.replace('', ' ') would insert a space
        # between every character, so empty entries are skipped.
        if item:
            x = x.replace(item, ' ')
    return x
def multiple_spaces_to_one(sentence: str) -> str:
    """Collapse each run of consecutive space characters into one space.

    Only the literal space character (U+0020) is affected; tabs, newlines
    and other whitespace are left untouched, and leading/trailing spaces are
    collapsed but not removed.

    Args:
        sentence: Text to normalize.

    Returns:
        ``sentence`` with every run of one or more spaces replaced by a
        single space.
    """
    return re.sub(r' +', ' ', sentence)
def remove_duplicated_words(sentence: str) -> str:
    """Drop repeated words, keeping the first occurrence of each.

    Words are the pieces obtained by splitting on a single space character,
    so consecutive spaces yield empty-string "words" that are deduplicated
    like any other token.

    Args:
        sentence: Space-separated text.

    Returns:
        The unique words joined by single spaces, in first-occurrence order.
    """
    # dict.fromkeys keeps insertion order, so the result is deterministic
    # (a set would return the words in arbitrary order).
    return ' '.join(dict.fromkeys(sentence.split(' ')))
def preprocessing(sentence):
    """Normalize a raw sentence through a fixed sequence of cleanup steps.

    Steps, in order:
      1. Replace every character listed in ``special_chars`` with a space.
      2. Delete ASCII digits and standalone Hangul jamo (e.g. 'ㅋㅋㅋ', 'ㅏㅏ').
      3. Strip leading/trailing whitespace and lowercase the text.
      4. Collapse runs of spaces into a single space.
      5. Remove repeated words, keeping the first occurrence of each.

    Args:
        sentence: Raw input text.

    Returns:
        The cleaned, lowercased, space-separated text with unique words.
    """
    sentence = string_cleanup(sentence, special_chars)
    # Inside [...] a '|' is a literal pipe, not alternation, so it is left
    # out of the class; pipes were already turned into spaces by the step above.
    sentence = re.sub('[0-9ㄱ-ㅎㅏ-ㅣ]+', '', sentence)
    sentence = sentence.strip().lower()
    sentence = multiple_spaces_to_one(sentence)
    # Order-preserving de-duplication of the space-separated words.
    return ' '.join(dict.fromkeys(sentence.split(' ')))
def preprocessing_udf(x):
    """Row-level adapter: run ``preprocessing`` on the ``'context'`` entry of ``x``.

    ``x`` must support ``x['context']``; at the bottom of this file it is
    used as the callback of a row-wise ``apply(..., axis=1)``.

    Returns:
        Whatever ``preprocessing`` returns for that entry.
    """
    return preprocessing(x['context'])
# Quick check: apply preprocessing_udf row-wise (axis=1) to the first two rows
# of result_df. The result is not assigned to anything.
# NOTE(review): result_df is not defined in the visible file — presumably a
# pandas DataFrame with a 'context' column created elsewhere; confirm.
result_df.head(2).apply(preprocessing_udf, axis=1)