Requirement already satisfied: konlpy in /usr/local/lib/python3.7/dist-packages (0.5.2)
Requirement already satisfied: beautifulsoup4==4.6.0 in /usr/local/lib/python3.7/dist-packages (from konlpy) (4.6.0)
Requirement already satisfied: JPype1>=0.7.0 in /usr/local/lib/python3.7/dist-packages (from konlpy) (1.3.0)
Requirement already satisfied: colorama in /usr/local/lib/python3.7/dist-packages (from konlpy) (0.4.4)
Requirement already satisfied: numpy>=1.6 in /usr/local/lib/python3.7/dist-packages (from konlpy) (1.19.5)
Requirement already satisfied: tweepy>=3.7.0 in /usr/local/lib/python3.7/dist-packages (from konlpy) (3.10.0)
Requirement already satisfied: lxml>=4.1.0 in /usr/local/lib/python3.7/dist-packages (from konlpy) (4.2.6)
Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from JPype1>=0.7.0->konlpy) (3.10.0.2)
Requirement already satisfied: requests[socks]>=2.11.1 in /usr/local/lib/python3.7/dist-packages (from tweepy>=3.7.0->konlpy) (2.23.0)
Requirement already satisfied: requests-oauthlib>=0.7.0 in /usr/local/lib/python3.7/dist-packages (from tweepy>=3.7.0->konlpy) (1.3.0)
Requirement already satisfied: six>=1.10.0 in /usr/local/lib/python3.7/dist-packages (from tweepy>=3.7.0->konlpy) (1.15.0)
Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.7/dist-packages (from requests-oauthlib>=0.7.0->tweepy>=3.7.0->konlpy) (3.1.1)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests[socks]>=2.11.1->tweepy>=3.7.0->konlpy) (2021.10.8)
Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests[socks]>=2.11.1->tweepy>=3.7.0->konlpy) (3.0.4)
Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests[socks]>=2.11.1->tweepy>=3.7.0->konlpy) (1.24.3)
Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests[socks]>=2.11.1->tweepy>=3.7.0->konlpy) (2.10)
Requirement already satisfied: PySocks!=1.5.7,>=1.5.6 in /usr/local/lib/python3.7/dist-packages (from requests[socks]>=2.11.1->tweepy>=3.7.0->konlpy) (1.7.1)
Requirement already satisfied: nltk in /usr/local/lib/python3.7/dist-packages (3.2.5)
Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from nltk) (1.15.0)
2) Bag of Words(BoW)
import re

from konlpy.tag import Okt

okt = Okt()

# Cleaning step: strip periods with a regex before tokenization.
token = re.sub(r"(\.)", "", "정부가 발표하는 물가상승률과 소비자가 느끼는 물가상승률은 다르다.")
# Tokenize into morphemes with the Okt analyzer.
token = okt.morphs(token)

word2index = {}  # word -> index assigned in first-seen order
bow = []         # bow[i] = frequency of the word whose index is i

for voca in token:
    if voca not in word2index:
        # First occurrence: assign the next index and start its count at 1.
        word2index[voca] = len(word2index)
        bow.insert(len(word2index) - 1, 1)
    else:
        # Repeated word: bump the count at its existing index.
        index = word2index[voca]
        bow[index] = bow[index] + 1

print(word2index)
from sklearn.feature_extraction.text import CountVectorizer

corpus = ['you know I want your love. because I love you.']
vector = CountVectorizer()

# Record each word's frequency observed in the corpus.
print(vector.fit_transform(corpus).toarray())
# Show which index was assigned to each word.
print(vector.vocabulary_)
from sklearn.feature_extraction.text import CountVectorizer

text = ["Family is not an important thing. It's everything."]
# User-supplied stop-word list: these words are excluded from the vocabulary.
vect = CountVectorizer(stop_words=["the", "a", "an", "is", "not"])

print(vect.fit_transform(text).toarray())
print(vect.vocabulary_)
from sklearn.feature_extraction.text import CountVectorizer

text = ["Family is not an important thing. It's everything."]
# Use scikit-learn's built-in English stop-word list.
vect = CountVectorizer(stop_words="english")

print(vect.fit_transform(text).toarray())
print(vect.vocabulary_)
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

# Download NLTK's stop-word corpus (no-op if already present).
nltk.download('stopwords')

text = ["Family is not an important thing. It's everything."]
# Use NLTK's English stop-word list instead of scikit-learn's built-in one.
sw = stopwords.words("english")
vect = CountVectorizer(stop_words=sw)

print(vect.fit_transform(text).toarray())
print(vect.vocabulary_)
import pandas as pd   # DataFrame display of the term-document matrices
from math import log  # natural log for the IDF formula

docs = [
    '먹고 싶은 사과',
    '먹고 싶은 바나나',
    '길고 노란 바나나 바나나',
    '저는 과일이 좋아요',
]
# Vocabulary: unique whitespace tokens across all documents, sorted.
vocab = sorted(set(w for doc in docs for w in doc.split()))
N = len(docs)  # total number of documents


def tf(t, d):
    """Term frequency: number of occurrences of term t in document d."""
    return d.count(t)


def idf(t):
    """Inverse document frequency: log(N / (df + 1)), df = docs containing t.

    Membership is tested against the token list (doc.split()), not the raw
    string, so a term cannot match as a substring of a longer word.
    """
    df = 0
    for doc in docs:
        df += t in doc.split()
    return log(N / (df + 1))


def tfidf(t, d):
    """TF-IDF score of term t in document d."""
    return tf(t, d) * idf(t)


# Build the term-frequency matrix: one row per document, one column per term.
result = [[tf(t, d) for t in vocab] for d in docs]
tf_ = pd.DataFrame(result, columns=vocab)
tf_
과일이
길고
노란
먹고
바나나
사과
싶은
저는
좋아요
0
0
0
0
1
0
1
1
0
0
1
0
0
0
1
1
0
1
0
0
2
0
1
1
0
2
0
0
0
0
3
1
0
0
0
0
0
0
1
1
# Compute the IDF value for every vocabulary term and show them as a table.
result = [idf(t) for t in vocab]
idf_ = pd.DataFrame(result, index=vocab, columns=["IDF"])
idf_
IDF
과일이
0.693147
길고
0.693147
노란
0.693147
먹고
0.287682
바나나
0.287682
사과
0.693147
싶은
0.287682
저는
0.693147
좋아요
0.693147
# Build the TF-IDF matrix: one row per document, one column per term.
result = [[tfidf(t, d) for t in vocab] for d in docs]
tfidf_ = pd.DataFrame(result, columns=vocab)
tfidf_
과일이
길고
노란
먹고
바나나
사과
싶은
저는
좋아요
0
0.000000
0.000000
0.000000
0.287682
0.000000
0.693147
0.287682
0.000000
0.000000
1
0.000000
0.000000
0.000000
0.287682
0.287682
0.000000
0.287682
0.000000
0.000000
2
0.000000
0.693147
0.693147
0.000000
0.575364
0.000000
0.000000
0.000000
0.000000
3
0.693147
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.693147
0.693147
from sklearn.feature_extraction.text import CountVectorizer

corpus = [
    'you know I want your love',
    'I like you',
    'what should I do ',
]
vector = CountVectorizer()

# Document-term matrix: each word's frequency per document.
print(vector.fit_transform(corpus).toarray())
# Index assigned to each word in the vocabulary.
print(vector.vocabulary_)
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    'you know I want your love',
    'I like you',
    'what should I do ',
]
# Same corpus as the CountVectorizer example, but weighted with TF-IDF.
tfidfv = TfidfVectorizer().fit(corpus)

print(tfidfv.transform(corpus).toarray())
print(tfidfv.vocabulary_)