Commit 8958156c authored by Jim Hoekstra's avatar Jim Hoekstra 👋🏻
Browse files

Merge branch 'develop' into 'master'

Develop

See merge request !17
parents 4c49b267 bfffdacf
FROM python:3

# Run the app as an unprivileged user: dedicated group + home directory,
# both system accounts (-r).
RUN groupadd -r msx_group
RUN useradd --create-home -r -g msx_group msx_user

# Flush Python stdout/stderr straight to the container log.
ENV PYTHONUNBUFFERED=1
......
import dash_cytoscape as cyto
import pandas as pd
from math import factorial
class Graph:
......@@ -6,8 +8,10 @@ class Graph:
def __init__(self):
self.nodes = []
self.edges = []
self.COUNT_THRESHOLD = 2
self.MAX_NUM_WORDS = 10
self.N_INTERSECTING_WORDS = 5
def get_all_words(self):
all_words = [node_dict['data']['label'] for node_dict in self.nodes]
......@@ -85,6 +89,7 @@ class Graph:
elements=elements,
userZoomingEnabled=False,
userPanningEnabled=False,
maxZoom=2.0,
stylesheet=[
{
'selector': '[is_base_node > 0.5]',
......@@ -102,21 +107,58 @@ class Graph:
]
)
def extend_graph(self, word2vec_model, base_node, words_to_exclude=None, weight_threshold=0.3):
    """Grow the graph with words strongly associated with the current nodes.

    For every current node, fetch its top associated words from the
    word2vec model, weight candidate words by how many node *pairs* share
    them (normalized by the number of pairs), and add every candidate whose
    weight exceeds ``weight_threshold`` as a node connected to ``base_node``.

    Parameters
    ----------
    word2vec_model : object exposing ``get_associated_words(word, top_n)``.
    base_node : id of the node new words are attached to.
    words_to_exclude : optional iterable of words never to add; defaults to
        the words already present in the graph.
    weight_threshold : minimum normalized pair-intersection weight.
    """
    current_words = [node['data']['id'] for node in self.nodes]
    if words_to_exclude is None:
        # By default never re-add a word that is already a node.
        words_to_exclude = current_words

    # Map each current word to its top-100 model associations.
    all_associated_words = {}
    for current_word in current_words:
        all_associated_words[current_word] = word2vec_model.get_associated_words(
            current_word, top_n=100)

    # One row per (word pair, shared association); counting occurrences of
    # each shared word gives its raw weight.
    intersections_df = self.construct_intersection_df(current_words, all_associated_words)
    weights = intersections_df['intersection'].value_counts()
    weights = weights.rename_axis(['word'], axis='index')
    weights = weights.reset_index(name='weight')
    # Normalize by the number of word pairs so the threshold is scale-free.
    weights['weight'] = self.normalize_weights(weights['weight'], len(current_words))

    if len(weights['word'].values) == 0:
        return

    exclude_words_filter = [word not in words_to_exclude for word in weights['word'].values]
    weights = weights[exclude_words_filter]
    weights_after_threshold = weights.loc[weights['weight'] > weight_threshold]
    words_to_add = weights_after_threshold['word'].values

    self.add_nodes(words_to_add)
    self.add_edges(base_node, words_to_add)
def construct_intersection_df(self, current_words, all_associated_words):
word1s = []
word2s = []
intersections = []
for i in range(len(current_words) - 1):
for j in range(i + 1, len(current_words)):
similar_words_1 = all_associated_words[current_words[i]]
similar_words_2 = all_associated_words[current_words[j]]
intersections_for_words = [word for word in similar_words_1 if word in similar_words_2][
:self.N_INTERSECTING_WORDS]
if len(intersections_for_words) > 0:
for intersection_for_words in intersections_for_words:
word1s.append(current_words[i])
word2s.append(current_words[j])
intersections.append(intersection_for_words)
return pd.DataFrame.from_dict({'word1': word1s, 'word2': word2s, 'intersection': intersections})
@staticmethod
def normalize_weights(weights, number_of_words):
if number_of_words >= 2:
number_of_combinations = factorial(number_of_words) / (2 * factorial(number_of_words - 2))
return weights / number_of_combinations
else:
return weights
......@@ -10,9 +10,20 @@ class AssociatedWords:
print(" Word2Vec model is ready. Enjoy!!!\n")
def get_associated_words(self, word, top_n=10):
    """Return up to ``top_n`` words the model associates with ``word``.

    The query is lowercased before hitting the model (the vocabulary is
    presumably lowercase — confirm against the loaded model), and the
    raw gensim ``(word, similarity)`` pairs are passed through
    ``filter_results`` before being returned.
    """
    lowercase_word = word.lower()
    gensim_result = self.model.most_similar(lowercase_word, topn=top_n)
    words = self.filter_results(gensim_result, lowercase_word)
    return words
def filter_results(self, gensim_result, base_word):
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment