def words_to_sets(words_set):
    """Expand each input term with similar words from the module-level ``model``.

    ``words_set`` is a newline-separated string of terms. Each term is
    lower-cased and its spaces are replaced by underscores before it is looked
    up with ``model.most_similar(term, topn=20)``.

    Every candidate is annotated with the normalised string distance to the
    term (``sdi.rdlevenshtein_norm``) and the position of the term inside the
    candidate (``str.find``; -1 when absent). A candidate is kept only if it is
    all lower-case, does not contain "www", has a string distance above 0.5 and
    does not contain the term as a substring. At most 10 candidates are kept
    per term.

    Returns:
        A tuple ``(similar_terms, words_not_vocabulary)`` where
        ``similar_terms`` maps each term to a list of
        ``(word, score, string_dist, substring)`` tuples and
        ``words_not_vocabulary`` lists the terms whose lookup raised
        ``KeyError``.
    """
    # Third-party helpers are imported lazily so that merely importing this
    # module does not require them.
    from tqdm import tqdm
    import stringdist as sdi

    terms = [word.replace(' ', '_') for word in words_set.lower().split("\n")]

    similar_terms = dict()
    words_not_vocabulary = []
    for term in tqdm(terms):
        try:
            neighbours = model.most_similar(term, topn=20)
        except KeyError:
            # The model does not know this term; report it instead of failing.
            words_not_vocabulary.append(term)
            continue

        annotated = [
            (candidate, score, sdi.rdlevenshtein_norm(term, candidate), candidate.find(term))
            for candidate, score in neighbours
        ]
        similar_terms[term] = [
            (candidate, score, string_dist, substring)
            for candidate, score, string_dist, substring in annotated
            if candidate == candidate.lower()
            and "www" not in candidate
            and string_dist > 0.5
            and substring == -1
        ][:10]
    return similar_terms, words_not_vocabulary


def sets_to_network(similar_terms):
    """Convert the ``similar_terms`` mapping into dash-cytoscape elements.

    Args:
        similar_terms: mapping of an original term to a list of tuples whose
            first element is a similar word (as produced by ``words_to_sets``).

    Returns:
        A tuple ``(nodes, edges)``. ``nodes`` holds one element per distinct
        word: original terms carry the class ``'followerNode'``, words that
        only occur as similar words carry ``'followingNode'``. ``edges`` holds
        one ``source -> target`` element per (term, similar word) pair.
    """
    nodes = []
    edges = []
    seen_ids = set()

    def _add_node(word, css_class):
        # Node ids must be unique, so a word is only ever emitted once.
        if word not in seen_ids:
            seen_ids.add(word)
            nodes.append({"data": {"id": word, "label": word}, "classes": css_class})

    # Register the original terms first so they keep the 'followerNode' class
    # even when they also show up as a similar word of another term.
    for item in similar_terms:
        _add_node(item, 'followerNode')

    for item, related in similar_terms.items():
        for entry in related:
            _add_node(entry[0], 'followingNode')
            edges.append({'data': {'source': item, 'target': entry[0]}})

    return nodes, edges
Input(component_id='extend-graph-button', component_property='n_clicks'), Input(component_id='remove-word-button', component_property='n_clicks'), State(component_id='base-word-input', component_property='value'), State(component_id='add-word-input', component_property='value'), @@ -42,8 +43,8 @@ def update_base_word(submit_word_button, base_word_input): State(component_id='base-word-div', component_property='children'), prevent_initial_call=True ) -def update_graph_elements(submit_word_button, add_word_button, remove_word_button, base_word_input, add_word_input, - nodes_and_edges, selected_nodes, base_word_state): +def update_graph_elements(submit_word_button, add_word_button, extend_graph_button, remove_word_button, base_word_input, + add_word_input, nodes_and_edges, selected_nodes, base_word_state): callback_context = dash.callback_context button_id = callback_context.triggered[0]['prop_id'].split('.')[0] @@ -65,6 +66,12 @@ def update_graph_elements(submit_word_button, add_word_button, remove_word_butto else: raise PreventUpdate + if button_id == 'extend-graph-button': + graph.set_nodes_and_edges(json.loads(nodes_and_edges)) + graph.extend_graph(word2vec_model, base_word_state) + new_nodes_and_edges = graph.get_nodes_and_edges() + return json.dumps(new_nodes_and_edges) + if button_id == 'remove-word-button': graph.set_nodes_and_edges(json.loads(nodes_and_edges)) if selected_nodes is not None: @@ -93,8 +100,14 @@ def update_graph(nodes_and_edges): @app.callback( Output(component_id='msx-graph', component_property='autoRefreshLayout'), Input(component_id='add-word-button', component_property='n_clicks'), + Input(component_id='extend-graph-button', component_property='n_clicks'), Input(component_id='remove-word-button', component_property='n_clicks'), prevent_initial_call=True ) -def set_auto_refresh_layout(add_word_button, remove_word_button): - return False +def set_auto_refresh_layout(add_word_button, extend_graph_button, remove_word_button): + callback_context 
def extend_graph(self, word2vec_model, base_node):
    """Add the words most often associated with the current nodes to the graph.

    Every node id currently in ``self.nodes`` is looked up through
    ``word2vec_model.get_associated_words(word, top_n=100)``. Suggestions that
    are already in the graph are ignored. A suggestion is kept when it was
    proposed at least ``self.COUNT_THRESHOLD`` times; while more than
    ``self.MAX_NUM_WORDS`` suggestions remain, the required count is raised by
    one and the selection is repeated. The surviving words are handed to
    ``self.add_nodes`` and linked to ``base_node`` via ``self.add_edges``.

    Args:
        word2vec_model: object exposing ``get_associated_words(word, top_n=...)``.
        base_node: forwarded to ``self.add_edges`` as the word the new words
            are connected to.
    """
    from collections import Counter

    current_words = [node['data']['id'] for node in self.nodes]
    known_words = set(current_words)

    # Count how many current nodes suggest each not-yet-present word.
    suggestion_counts = Counter()
    for current_word in current_words:
        try:
            associated_words = word2vec_model.get_associated_words(current_word, top_n=100)
        except KeyError:
            # gensim's most_similar raises KeyError for words outside the
            # model vocabulary; such a node simply contributes no suggestions.
            continue
        suggestion_counts.update(
            word for word in associated_words if word not in known_words
        )

    # Work on a local copy: the configured threshold must stay untouched so
    # that later calls start from the same baseline.
    threshold = self.COUNT_THRESHOLD
    common_associated_words = [
        word for word, count in suggestion_counts.items() if count >= threshold
    ]
    while len(common_associated_words) > self.MAX_NUM_WORDS:
        threshold += 1
        common_associated_words = [
            word for word, count in suggestion_counts.items() if count >= threshold
        ]

    self.add_nodes(common_associated_words)
    self.add_edges(base_node, common_associated_words)
def get_associated_words(self, word, top_n=10):
    """Return the words the embedding model associates with ``word``.

    Asks ``self.model.most_similar`` for the ``top_n`` closest entries and
    passes that raw result, together with the query word, through
    ``self.filter_results``; whatever the filter returns is returned as is.
    """
    # Shape of the raw result, e.g.
    # [('apple', 1.0), ('banana', 1.0), ('strawberry', 1.0)]
    nearest = self.model.most_similar(word, topn=top_n)
    return self.filter_results(nearest, word)