Commit d74e6bc2 authored by Jim Hoekstra's avatar Jim Hoekstra 👋🏻
Browse files

issues MSX-29 and MSX-30

parent beeb826a
# -*- coding: utf-8 -*-
"""
Starting date: 09-02-21
Code for MSX: Extension of candidate words.
Functions:
words_to_sets: Query the existing word2vec model for similar words, then
prune the similar words by string distance.
Input: set of words
Output: dictionary with original + similar words + similarity scores
sets_to_network: Converts part of the output of words_to_sets function into 2 objects
to be loaded in dash-cytoscape.
Input: dictionary with original + similar words
Output: two arrays to be used in dash-cytoscape
Codes include the use of an existent
word2vec pretrained model and string/substring distance.
Author: Julian Bianco Martinez
"""
### Loading model
import gensim.downloader as api
# Different pre-trained embeddings can be swapped in here; the GloVe Twitter
# vectors below are the ones currently active.
#model = api.load('word2vec-google-news-300')
model = api.load('glove-twitter-200')
# NOTE(review): the result of this query is discarded -- presumably a leftover
# smoke test that the loaded model answers similarity queries; confirm before
# removing.
model.most_similar('people', topn=100)
# Seed vocabulary, one term per line. words_to_sets() lower-cases every line
# and replaces spaces with underscores before querying the model.
terms = """Jargon
Domain
Standardisation
Interactive tool
word2vec
NLP
Collaboration
Vocabulary
Graph
Meaning
Tool
Communication
People
Terms
Associations
Efficient"""
# Alias of the seed terms. NOTE(review): the module-level code shown here
# passes `terms` (not `words_set`) to words_to_sets(); this name may be unused.
words_set = terms
#%% Functions
def words_to_sets(words_set):
    """Look up similar words for every term in a newline-separated string.

    Each line of ``words_set`` is lower-cased and its spaces are replaced by
    underscores. The module-level ``model`` is then asked for the 20 most
    similar words of each term, and every candidate is kept only if it:

    * is entirely lower-case,
    * does not contain ``"www"``,
    * has ``stringdist.rdlevenshtein_norm(term, candidate) > 0.5``, and
    * does not contain the term as a substring.

    At most the first 10 surviving candidates are stored per term.

    Args:
        words_set: Newline-separated terms.

    Returns:
        A ``(similar_terms, words_not_vocabulary)`` tuple. ``similar_terms``
        maps each term known to the model to a list of
        ``(word, score, string_dist, substring)`` tuples, where ``substring``
        is the result of ``word.find(term)`` (always ``-1`` after filtering).
        ``words_not_vocabulary`` lists the terms for which the model raised
        ``KeyError``.
    """
    # Kept as function-local imports, matching the original block.
    from tqdm import tqdm
    import stringdist as sdi

    terms = [line.replace(' ', '_') for line in words_set.lower().split("\n")]
    similar_terms = {}
    words_not_vocabulary = []

    for term in tqdm(terms):
        # Only the model lookup can raise KeyError (term not in vocabulary),
        # so it is the only statement guarded by the try.
        try:
            neighbours = model.most_similar(term, topn=20)
        except KeyError:
            words_not_vocabulary.append(term)
            continue

        kept = []
        for word, score in neighbours:
            string_dist = sdi.rdlevenshtein_norm(term, word)
            substring = word.find(term)
            if (word == word.lower()
                    and "www" not in word
                    and string_dist > 0.5
                    and substring == -1):
                kept.append((word, score, string_dist, substring))
        similar_terms[term] = kept[:10]

    return similar_terms, words_not_vocabulary
def sets_to_network(similar_terms):
    """Convert a similarity mapping into dash-cytoscape nodes and edges.

    Args:
        similar_terms: Mapping of an original term to a list of tuples whose
            first element is a similar word (the remaining tuple elements are
            ignored here).

    Returns:
        A ``(nodes, edges)`` tuple of lists of Cytoscape element dicts.
        Every distinct word yields exactly one node: original terms (the
        mapping's keys) carry the class ``'followerNode'``, words that only
        occur as similar words carry ``'followingNode'``. One edge is emitted
        per (term, similar word) pair.
    """
    ### CytoScape structure Creation
    edges = []
    nodes = []
    # Node ids already emitted. The node list holds dicts, so membership has
    # to be tracked by id rather than by testing the raw word against it.
    seen_ids = set()

    for item, neighbours in similar_terms.items():
        if item not in seen_ids:
            seen_ids.add(item)
            nodes.append({"data": {"id": item, "label": item}, "classes": 'followerNode'})

        for neighbour in neighbours:
            target = neighbour[0]
            # A similar word that is itself an original term is emitted as a
            # 'followerNode' when its own key is visited, so skip it here.
            if target not in seen_ids and target not in similar_terms:
                seen_ids.add(target)
                nodes.append({"data": {"id": target, "label": target}, "classes": 'followingNode'})
            edges.append({'data': {'source': item, 'target': target}})

    return nodes, edges
# Build the similarity sets for the seed terms and convert them into Cytoscape
# nodes/edges. NOTE: this runs at import time, not only when run as a script.
similar_terms, words_not_vocabulary = words_to_sets(terms)
nodes, edges = sets_to_network(similar_terms)
### END Functions
#%% Dash
import dash
import dash_cytoscape as cyto
import dash_html_components as html
#####Cyto Code
app = dash.Dash(__name__)
# define layout
# Stylesheet rules: every node shows its data 'label' as caption, and nodes
# with class 'followerNode' (the original terms, see sets_to_network) get a
# custom background colour. 'followingNode' has no rule of its own here.
default_stylesheet = [
{
"selector": 'node',
"style":{
"label": "data(label)"
}
},
{
'selector': '.followerNode',
'style': {
'background-color': '#00AAD9'
}
}
]
# Page layout: a single Cytoscape component fed with all edges and nodes,
# arranged with the 'cose' layout.
app.layout = html.Div([
cyto.Cytoscape(
id='test',
#style={'width': '100%', 'height': '400px'},
layout = {'name': 'cose'},
elements = edges + nodes,
stylesheet=default_stylesheet
)
])
# Start the Dash server only when this file is executed directly.
if __name__ == '__main__':
app.run_server(debug=False)
\ No newline at end of file
......@@ -34,6 +34,7 @@ def update_base_word(submit_word_button, base_word_input):
Output(component_id='graph-elements-div', component_property='children'),
Input(component_id='submit-word-button', component_property='n_clicks'),
Input(component_id='add-word-button', component_property='n_clicks'),
Input(component_id='extend-graph-button', component_property='n_clicks'),
Input(component_id='remove-word-button', component_property='n_clicks'),
State(component_id='base-word-input', component_property='value'),
State(component_id='add-word-input', component_property='value'),
......@@ -42,8 +43,8 @@ def update_base_word(submit_word_button, base_word_input):
State(component_id='base-word-div', component_property='children'),
prevent_initial_call=True
)
def update_graph_elements(submit_word_button, add_word_button, remove_word_button, base_word_input, add_word_input,
nodes_and_edges, selected_nodes, base_word_state):
def update_graph_elements(submit_word_button, add_word_button, extend_graph_button, remove_word_button, base_word_input,
add_word_input, nodes_and_edges, selected_nodes, base_word_state):
callback_context = dash.callback_context
button_id = callback_context.triggered[0]['prop_id'].split('.')[0]
......@@ -65,6 +66,12 @@ def update_graph_elements(submit_word_button, add_word_button, remove_word_butto
else:
raise PreventUpdate
if button_id == 'extend-graph-button':
graph.set_nodes_and_edges(json.loads(nodes_and_edges))
graph.extend_graph(word2vec_model, base_word_state)
new_nodes_and_edges = graph.get_nodes_and_edges()
return json.dumps(new_nodes_and_edges)
if button_id == 'remove-word-button':
graph.set_nodes_and_edges(json.loads(nodes_and_edges))
if selected_nodes is not None:
......@@ -93,8 +100,14 @@ def update_graph(nodes_and_edges):
@app.callback(
Output(component_id='msx-graph', component_property='autoRefreshLayout'),
Input(component_id='add-word-button', component_property='n_clicks'),
Input(component_id='extend-graph-button', component_property='n_clicks'),
Input(component_id='remove-word-button', component_property='n_clicks'),
prevent_initial_call=True
)
def set_auto_refresh_layout(add_word_button, remove_word_button):
return False
def set_auto_refresh_layout(add_word_button, extend_graph_button, remove_word_button):
    """Pick the return value from the button that triggered the callback.

    The click-count arguments are not read; only the id of the triggering
    component matters.

    Returns:
        ``False`` when the add-word or remove-word button fired, ``True``
        when the extend-graph button fired, and ``None`` for any other id.
    """
    trigger = dash.callback_context.triggered[0]
    # 'prop_id' has the form '<component-id>.<property>'; keep the id part.
    button_id = trigger['prop_id'].split('.')[0]

    if button_id in ('add-word-button', 'remove-word-button'):
        return False
    if button_id == 'extend-graph-button':
        return True
......@@ -6,6 +6,8 @@ class Graph:
# Create an empty graph together with the tuning constants read by extend_graph.
def __init__(self):
# Node and edge containers; both start out empty.
self.nodes = []
self.edges = []
# Starting value for the minimum number of times a candidate word must occur
# across the association lists gathered by extend_graph before it is added.
self.COUNT_THRESHOLD = 2
# extend_graph keeps raising its count threshold while more than this many
# candidate words qualify.
self.MAX_NUM_WORDS = 10
def get_all_words(self):
all_words = [node_dict['data']['label'] for node_dict in self.nodes]
......@@ -99,3 +101,21 @@ class Graph:
}
]
)
def extend_graph(self, word2vec_model, base_node):
    """Add words that are associated with several current nodes.

    For every node currently in the graph the 100 closest associated words
    are requested from ``word2vec_model``. Words already in the graph are
    ignored; the rest are counted across all nodes. Words reaching
    ``self.COUNT_THRESHOLD`` occurrences are candidates. While there are
    more than ``self.MAX_NUM_WORDS`` candidates, the threshold is raised by
    one for this call only -- ``self.COUNT_THRESHOLD`` itself is left
    untouched so repeated extensions always start from the configured value.

    Args:
        word2vec_model: Object exposing
            ``get_associated_words(word, top_n=...)`` returning an iterable
            of hashable words.
        base_node: Node passed to ``self.add_edges`` as the endpoint that
            every newly added word is connected to.

    Returns:
        None. The selected words are handed to ``self.add_nodes`` and
        ``self.add_edges``.
    """
    from collections import Counter

    current_words = [node['data']['id'] for node in self.nodes]
    known_words = set(current_words)

    # Count, over all current nodes, how often each not-yet-present word
    # shows up among the associated words.
    associated_words_count = Counter()
    for current_word in current_words:
        associated_words = word2vec_model.get_associated_words(current_word, top_n=100)
        associated_words_count.update(
            word for word in associated_words if word not in known_words
        )

    # Local copy: tightening the cut-off must not leak into later calls.
    threshold = self.COUNT_THRESHOLD
    common_associated_words = [
        word for word, count in associated_words_count.items() if count >= threshold
    ]
    while len(common_associated_words) > self.MAX_NUM_WORDS:
        threshold += 1
        common_associated_words = [
            word for word, count in associated_words_count.items() if count >= threshold
        ]

    self.add_nodes(common_associated_words)
    self.add_edges(base_node, common_associated_words)
......@@ -44,7 +44,10 @@ layout = html.Div(children=[
html.Div(className='col-2', children=[
html.Button(id='add-word-button', n_clicks_timestamp=0, children='Add Association', className='btn btn-success btn-lg'),
]),
html.Div(className='col-4', children=[
html.Div(className='col-2', children=[
html.Button(id='extend-graph-button', n_clicks_timestamp=0, children='Extend Graph', className='btn btn-success btn-lg'),
]),
html.Div(className='col-2', children=[
]),
html.Div(className='col-3', children=[
html.Button(id='remove-word-button', n_clicks_timestamp=0, children='Remove Selected Association', className='btn btn-danger btn-lg')]),
......
......@@ -5,13 +5,12 @@ import stringdist as sdi
class AssociatedWords:
def __init__(self):
self.N_RESULTS = 10
print("\n Word2Vec model is loading.This can take a couple of minutes.")
self.model = api.load('glove-twitter-200')
print(" Word2Vec model is ready. Enjoy!!!\n")
def get_associated_words(self, word):
gensim_result = self.model.most_similar(word, topn=self.N_RESULTS)
def get_associated_words(self, word, top_n=10):
    """Return the filtered words most similar to ``word``.

    Args:
        word: Query word passed to ``self.model.most_similar``.
        top_n: Number of neighbours requested from the model (default 10).

    Returns:
        Whatever ``self.filter_results`` produces for the model's result
        list and the query word.
    """
    neighbours = self.model.most_similar(word, topn=top_n)
    return self.filter_results(neighbours, word)
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment