[책요약]파이썬으로 배우는 응용텍스트 분석(CH1.언어와 계산) 코딩부분 #1
파이썬으로 배우는 응용텍스트 분석 (CH1.언어와 계산) 코딩부분 #1
P1 ~ P20까지
1. 남성형/여성형/남성형&여성형/알 수 없는 경우를 나타내는 상수와 단어 집합 구축
import nltk
nltk.download('punkt')
from collections import Counter
# Labels returned when classifying a sentence by the gendered words it contains.
MALE = 'male'
FEMALE = 'female'
UNKNOWN = 'unknown'
BOTH = 'both'

# Lower-cased tokens treated as male-gendered.
MALE_WORDS = {
    'guy', 'spokesman', 'chairman', "men's", 'men', 'him', "he's", 'his',
    'boy', 'boyfriend', 'boyfriends', 'boys', 'brother', 'brothers', 'dad',
    'dads', 'dude', 'father', 'fathers', 'fiance', 'gentleman', 'gentlemen',
    'god', 'grandfather', 'grandpa', 'grandson', 'groom', 'he', 'himself',
    'husband', 'husbands', 'king', 'male', 'man', 'mr', 'nephew', 'nephews',
    'priest', 'prince', 'son', 'sons', 'uncle', 'uncles', 'waiter', 'widower',
    'widowers',
}

# Lower-cased tokens treated as female-gendered.
FEMALE_WORDS = {
    'heroine', 'spokeswoman', 'chairwoman', "women's", 'actress', 'women',
    "she's", 'her', 'aunt', 'aunts', 'bride', 'daughter', 'daughters', 'female',
    'fiancee', 'girl', 'girlfriend', 'girlfriends', 'girls', 'goddess',
    'granddaughter', 'grandma', 'grandmother', 'herself', 'ladies', 'lady',
    'mom', 'moms', 'mother', 'mothers', 'mrs', 'ms', 'niece', 'nieces',
    'priestess', 'princess', 'queens', 'she', 'sister', 'sisters', 'waitress',
    'widow', 'widows', 'wife', 'wives', 'woman',
}
[nltk_data] Downloading package punkt to
[nltk_data] C:\Users\heedory\AppData\Roaming\nltk_data...
[nltk_data] Package punkt is already up-to-date!
2. 남성형/여성형/남성형&여성형/알 수 없는 경우 중 하나를 리턴해주기
def genderize(words):
    """Classify a collection of words by the gendered vocabulary it contains.

    Args:
        words: Iterable of hashable tokens. Matching against MALE_WORDS and
            FEMALE_WORDS is exact, so tokens are compared as given.

    Returns:
        BOTH if words from both sets occur, MALE or FEMALE if only one set
        is represented, and UNKNOWN if neither is.
    """
    # A non-empty intersection means at least one word of that set occurs.
    has_male = bool(MALE_WORDS.intersection(words))
    has_female = bool(FEMALE_WORDS.intersection(words))

    if has_male and has_female:
        return BOTH
    if has_male:
        return MALE
    if has_female:
        return FEMALE
    return UNKNOWN
3. 문장들 돌면서 카운트 세기
def count_gender(sentences):
    """Tally sentences and words per gender label.

    Args:
        sentences: Iterable of sentences, each a sized collection of tokens
            (its length is taken with ``len``).

    Returns:
        A ``(sents, words)`` pair of Counters keyed by the label that
        ``genderize`` returns: ``sents`` holds the number of sentences per
        label, ``words`` the total number of tokens in those sentences.
    """
    sentence_tally = Counter()  # e.g. Counter({'male': 2, 'female': 3})
    word_tally = Counter()

    for tokens in sentences:
        label = genderize(tokens)
        sentence_tally[label] += 1
        # Every token of the sentence is attributed to the sentence's label.
        word_tally[label] += len(tokens)

    return sentence_tally, word_tally
4. 토큰화하기 -> count_gender에서 수 빼내서 퍼센트 구하고 프린트까지 하기
def parse_gender(text):
    """Print the share of words falling under each gender label in ``text``.

    The text is split into sentences and each sentence into lower-cased
    tokens using NLTK; the sentences are then tallied with ``count_gender``.
    One line is printed per label: the percentage of all tokens attributed
    to that label, the label itself, and its sentence count.

    Args:
        text: The raw text to analyse.
    """
    tokenized = []
    for raw_sentence in nltk.sent_tokenize(text):
        # Lower-case so tokens can match the lower-cased gender word sets.
        tokenized.append(
            [token.lower() for token in nltk.word_tokenize(raw_sentence)]
        )

    sent_counts, word_counts = count_gender(tokenized)
    grand_total = sum(word_counts.values())

    # Iterating the Counter yields its labels in insertion order.
    for label in word_counts:
        share = (word_counts[label] / grand_total) * 100
        n_sentences = sent_counts[label]
        print(
            "{:0.3f}% {} ({} sentences)".format(share, label, n_sentences)
        )
if __name__ == '__main__':
    # Script entry point: read the sample article and print its gender stats.
    # The encoding is given explicitly because open() otherwise falls back to
    # the platform's locale encoding, which is not UTF-8 on many Windows
    # setups and would fail or garble non-ASCII punctuation.
    # NOTE(review): assumes ballet.txt is stored as UTF-8 — confirm.
    with open('ballet.txt', 'r', encoding='utf-8') as f:
        parse_gender(f.read())