from wordcloud import WordCloud

시사경제지식

by 브로마리 2023. 8. 28. 09:22

import matplotlib.pyplot as plt
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud

# Build a word cloud from 'input.txt': tokenize the text, drop stop words,
# count word frequencies, then render the cloud on screen, save it as a PNG,
# and export the frequency table to an Excel workbook.

# Read the whole input text file.
with open('input.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Load the stop-word list (change the language name to switch languages).
# NOTE: the NLTK 'stopwords' corpus and the tokenizer models must already be
# installed locally (e.g. via nltk.download) or these calls raise LookupError.
stop_words = set(stopwords.words('english'))

# Tokenize the text and drop stop words; the comparison is case-insensitive,
# but the surviving tokens keep their original casing.
# NOTE(review): punctuation tokens produced by word_tokenize are not filtered
# here and will be counted like words — confirm whether that is intended.
words = word_tokenize(text)
filtered_words = [word for word in words if word.lower() not in stop_words]

# Count how often each remaining token occurs.
word_freq = nltk.FreqDist(filtered_words)

# Generate the word cloud directly from the frequency table.
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
).generate_from_frequencies(word_freq)

# Draw the word cloud on a matplotlib figure without axes.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')

# Save the word cloud as an image file.
wordcloud.to_file('wordcloud.png')

# Convert the frequency table to a one-column DataFrame indexed by token.
wordcloud_data = pd.DataFrame.from_dict(word_freq, orient='index', columns=['Frequency'])

# Save the frequency table as an Excel file.
wordcloud_data.to_excel('wordcloud_data.xlsx')

# Show the figure last, after both output files have been written.
plt.show()

저작자표시 비영리 변경금지 (새창열림)

'시사경제지식' 카테고리의 다른 글

from selenium.webdriver.common.keys import Keys (0)	2023.08.29
import matplotlib.pyplot as plt (0)	2023.08.28
soup = BeautifulSoup(response.text, 'html.parser') (0)	2023.08.26
with open(file_path, 'r', encoding='utf-8') as file: (0)	2023.08.26
def generate_random_matrix(): (0)	2023.08.26