Before starting work, install the required packages:
pip install numpy scipy matplotlib ipython scikit-learn pandas wordcloud nltk gensim bokeh
Then download the NLTK stopwords corpus from a Python shell:
$ python
>>> import nltk
>>> nltk.download('stopwords')
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib
from matplotlib import pyplot as plt
import os
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
data = pd.read_csv("./songs.csv")
data
def formatted_text(text):
    '''remove punctuation marks and lowercase the text'''
    import string
    # replacing the punctuation with no space,
    # which in effect deletes the punctuation marks
    translator = str.maketrans('', '', string.punctuation)
    # return the text stripped of punctuation marks, lowercased
    result = text.translate(translator)
    #result = result.replace("\n","")
    return result.lower()
data['lyrics'] = data['lyrics'].apply(formatted_text)
data['lyrics']
def length(text):
    '''return the length of the text'''
    return len(str(text))
data['length'] = data['lyrics'].apply(length)
data
# matplotlib.rcParams['figure.figsize'] = (12.0, 6.0)
bins = 500
plt.hist(list(data['length']), bins=bins, alpha=1, label='Songs')
plt.xlabel('lyrics length (characters)')
plt.ylabel('number of songs')
plt.grid()
plt.legend(loc='upper right')
plt.show()
# extracting the stopwords from nltk library
sw = stopwords.words('english')
# displaying the stopwords
np.array(sw)
print("Number of stopwords: ", len(sw))
def remove_stopwords(text):
    '''remove stopwords from the text (named so it does not shadow nltk.corpus.stopwords)'''
    # removing the stop words and lowercasing the selected words
    text = [word.lower() for word in text.split() if word.lower() not in sw]
    # joining the list of words with a space separator
    return " ".join(text)
data['lyrics'] = data['lyrics'].apply(remove_stopwords)
# create a count vectorizer object
count_vectorizer = CountVectorizer()
# fit the count vectorizer using the text data
count_vectorizer.fit(data['lyrics'])
# count the occurrences of each term across the whole corpus
# (note: vocabulary_ maps terms to column indices, not counts, so we sum the document-term matrix instead)
term_counts = count_vectorizer.transform(data['lyrics']).sum(axis=0)
# store the counts in a pandas Series with the vocabulary as index
vocab_bef_stem = pd.Series(np.asarray(term_counts).ravel(), index=count_vectorizer.get_feature_names())
# sort the series in descending order
vocab_bef_stem = vocab_bef_stem.sort_values(ascending=False)
top_vocab = vocab_bef_stem.head(10)
top_vocab
# create an object of stemming function
stemmer = SnowballStemmer("english")
def stemming(text):
    '''stem each word in the given text'''
    text = [stemmer.stem(word) for word in text.split()]
    return " ".join(text)
data['lyrics'] = data['lyrics'].apply(stemming)
top_vocab
%%time
text = " ".join(e for e in data['lyrics'])
wordcloud = WordCloud(max_font_size=50, max_words=100, background_color="white", width=700).generate(text)
# Save image file
wordcloud.to_file("./bag_of_words.png")
# Display the generated image:
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
# re-fit a count vectorizer on the stemmed lyrics and count each term
count_vectorizer = CountVectorizer()
term_counts = count_vectorizer.fit_transform(data['lyrics']).sum(axis=0)
# store the counts in a pandas Series with the stemmed vocabulary as index
vocab_after_stem = pd.Series(np.asarray(term_counts).ravel(), index=count_vectorizer.get_feature_names())
# sort the series in descending order
vocab_after_stem = vocab_after_stem.sort_values(ascending=False)
print("total vocab after stemming:", len(vocab_after_stem))
count = CountVectorizer()
bag_of_words = count.fit_transform(np.array(data['lyrics']))
# Show feature matrix
bag_of_words.toarray()
# Get feature names
feature_names = count.get_feature_names()
print("feature_names", len(feature_names))
bag_of_words_dataFrame = pd.DataFrame(bag_of_words.toarray(), columns=feature_names)
pd.concat([data.year, bag_of_words_dataFrame], axis=1, sort=False)
most_used_words = bag_of_words_dataFrame.sum(axis=0).sort_values(ascending=False)
most_used_words_dataFrame = pd.DataFrame(data=most_used_words, columns=["count"])
most_used_words_dataFrame.head(20)
# Singapore, Together, One, Dream, Home
pd.concat([data.year, bag_of_words_dataFrame.singapor, bag_of_words_dataFrame.togeth, bag_of_words_dataFrame.one, bag_of_words_dataFrame.dream, bag_of_words_dataFrame.home], axis=1, sort=False)
# create the TF-IDF vectorizer object
tfidf_vectorizer = TfidfVectorizer(analyzer='word', min_df=0, stop_words='english')
# fit the vectorizer using the text data
tfidf_vectorizer.fit(data['lyrics'])
# collect the vocabulary items used in the vectorizer
dictionary = tfidf_vectorizer.vocabulary_.items()
# extract the TF-IDF representation matrix of the text data
tfidf_matrix = tfidf_vectorizer.transform(data['lyrics'])
# collect the TF-IDF matrix in a numpy array
array = tfidf_matrix.todense()
# store the TF-IDF array in a pandas dataframe
tfidf_df = pd.DataFrame(array, columns=tfidf_vectorizer.get_feature_names())
pd.concat([data.year, tfidf_df], axis=1, sort=False)
Use scikit-learn's cosine similarity function to compare one song's TF-IDF vector against every other song in the corpus. Here we look for songs similar to "Because It's Singapore" (song number 24 in the dataset).
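As a quick illustration of what the cosine score means, here is a minimal sketch on two hand-made toy vectors (the numbers are illustrative only and not taken from the dataset): vectors pointing in the same direction score 1.0, orthogonal vectors score 0.0.
# illustrative toy example of cosine similarity (not part of the song analysis)
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
a = np.array([[1.0, 0.0, 2.0]])
b = np.array([[2.0, 0.0, 4.0]])   # same direction as a
c = np.array([[0.0, 3.0, 0.0]])   # orthogonal to a
print(cosine_similarity(a, b))    # [[1.]]
print(cosine_similarity(a, c))    # [[0.]]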
song_number = 24
current_song = tfidf_matrix[song_number-1:song_number]
# cosine_similarity accepts sparse matrices directly
similarity = cosine_similarity(current_song, tfidf_matrix)
print("Song: ", data['title'][song_number-1])
# collect the similarity scores together with each song's title and year, then sort
similarity_dataFrame = pd.DataFrame(data=similarity.flatten(), columns=["cosine_similarity"])
similarity_dataFrame['title'] = data['title']
similarity_dataFrame['year'] = data['year']
similarity_dataFrame.sort_values(by=['cosine_similarity'], ascending=False)
current_directory = os.getcwd()
final_directory = os.path.join(current_directory, r'word2vec')
if not os.path.exists(final_directory):
    os.makedirs(final_directory)
import gensim
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence
from gensim import corpora, models
from gensim.models import LdaMulticore
from gensim.models import Word2Vec
from gensim.corpora import Dictionary, MmCorpus
data = pd.read_csv("./songs.csv")
all_sentences_normalized_filepath = 'word2vec/all_lyrics_text.txt'
# write one normalized lyric per line (skip rows with missing lyrics)
with open(all_sentences_normalized_filepath, 'w', encoding='utf-8') as f:
    for lyrics in data.lyrics.values:
        if pd.isnull(lyrics):  # if the lyrics are missing, go to the next song
            continue
        f.write(formatted_text(lyrics) + '\n')
# count the lines just written
with open(all_sentences_normalized_filepath, encoding='utf-8') as f:
    lines = len(f.readlines())
print("total lines: ", lines)
%%time
USE_PREMADE_BIGRAM_MODEL = False
all_bigram_model_filepath = 'word2vec/all_bigram_model'
all_unigram_sentences = LineSentence(all_sentences_normalized_filepath)
if not USE_PREMADE_BIGRAM_MODEL:
    # learn which word pairs occur together often enough to be treated as phrases
    all_bigram_model = Phrases(all_unigram_sentences)
    all_bigram_model.save(all_bigram_model_filepath)
else:
    all_bigram_model = Phrases.load(all_bigram_model_filepath)
print(all_sentences_normalized_filepath)
%%time
USE_PREMADE_BIGRAM_SENTENCES = False
all_bigram_sentences_filepath = 'word2vec/all_sentences_for_word2vec.txt'
if not USE_PREMADE_BIGRAM_SENTENCES:
    with open(all_bigram_sentences_filepath, 'w', encoding='utf-8') as f:
        for unigram_sentence in all_unigram_sentences:
            # apply the phrase model so frequent word pairs become single bigram tokens
            all_bigram_sentence = all_bigram_model[unigram_sentence]
            f.write(' '.join(all_bigram_sentence) + '\n')
else:
    assert os.path.exists(all_bigram_sentences_filepath)
%%time
USE_PREMADE_WORD2VEC = False
all2vec_filepath = 'word2vec/all_word2vec_model'
if not USE_PREMADE_WORD2VEC:
    lyrics_for_word2vec = LineSentence(all_bigram_sentences_filepath)
    # skip-gram model with 100-dimensional vectors
    all2vec = Word2Vec(lyrics_for_word2vec, size=100, window=5, min_count=1, sg=1)
    # keep training for additional passes over the corpus
    for _ in range(25):
        all2vec.train(lyrics_for_word2vec, total_examples=all2vec.corpus_count, epochs=30)
    all2vec.save(all2vec_filepath)
else:
    all2vec = Word2Vec.load(all2vec_filepath)
    all2vec.init_sims()
all2vec_filepath = 'word2vec/all_word2vec_model'
all2vec = Word2Vec.load(all2vec_filepath)
wv_dataFrame = pd.DataFrame(all2vec.wv.index2word)
wv_dataFrame.head()
all2vec.wv.most_similar(positive=['singapore'], topn=20)
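As a further usage sketch, you can also score the similarity between two specific words; this assumes both words actually occur in the training lyrics (they should, given min_count=1, but swap in other words if not).
# illustrative only: cosine similarity between two individual words from the lyrics
print(all2vec.wv.similarity('singapore', 'home'))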
# build a dict mapping each vocabulary word to its embedding vector
all2vec_dic = {}
for key in all2vec.wv.vocab:
    all2vec_dic[key] = all2vec.wv[key].tolist()
all2vec_dataFrame = pd.DataFrame.from_dict(all2vec_dic).T
all2vec_dataFrame.head()
%%time
from sklearn.manifold import TSNE
import pickle
USE_PREMADE_TSNE = False
tsne_filepath = 'word2vec/tsne.pkl'
if not USE_PREMADE_TSNE:
    tsne = TSNE(random_state=0)
    # project the 100-dimensional word vectors down to two dimensions
    tsne_points = tsne.fit_transform(all2vec_dataFrame)
    with open(tsne_filepath, 'wb') as f:
        pickle.dump(tsne_points, f)
else:
    with open(tsne_filepath, 'rb') as f:
        tsne_points = pickle.load(f)
tsne_df = pd.DataFrame(tsne_points, index=all2vec_dataFrame.index, columns=['x_coord', 'y_coord'])
tsne_df['word'] = tsne_df.index
tsne_df.head()
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource, LabelSet, value
# prepare the data in a form suitable for bokeh.
plot_data = ColumnDataSource(data=tsne_df)
# create the plot and configure it
tsne_plot = figure(title='t-SNE Word Embeddings',
                   plot_width=800,
                   plot_height=800,
                   active_scroll='wheel_zoom')
tsne_plot.circle('x_coord', 'y_coord', source=plot_data,
                 color='red', line_alpha=0.2, fill_alpha=0.1,
                 size=10, hover_line_color='orange')
labels = LabelSet(x='x_coord', y='y_coord', text='word', level='glyph',
                  x_offset=5, y_offset=5, source=plot_data, render_mode='canvas')
# adjust visual elements of the plot
tsne_plot.title.text_font_size = value('16pt')
tsne_plot.xaxis.visible = False
tsne_plot.yaxis.visible = False
tsne_plot.grid.grid_line_color = None
tsne_plot.outline_line_color = None
tsne_plot.add_layout(labels)
# show time
output_notebook()
show(tsne_plot);