Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
FoodInformatics
msx-tool
Commits
4c49b267
Commit
4c49b267
authored
Apr 14, 2021
by
Jim Hoekstra
👋🏻
Browse files
Merge branch 'develop' into 'master'
Merge develop into master See merge request
!13
parents
beeb826a
c921e7ad
Changes
7
Hide whitespace changes
Inline
Side-by-side
JBM_Intersection/Graph_Expantion_by_Intersection.py
deleted
100644 → 0
View file @
beeb826a
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 31 12:09:38 2021
@author: Julian Bianco-Martinez
Graph Expansion by Intersection of Concepts (GEIC)
"""
import
pandas
as
pd
import
numpy
as
np
def concepts_extraction_list(lst1):
    '''
    Return only the concept strings from word2vec output.

    ``lst1`` is a list of ``(word, similarity)`` pairs as returned by
    ``model.most_similar``; the similarity scores are discarded.
    '''
    return [pair[0] for pair in lst1]
def GEIC(concepts, words_to_exclude=None, topX_intersect_concepts=5,
         topXsimilarConcepts=100, threshold=0.5):
    '''
    Graph Expansion by Intersection of Concepts (GEIC).

    From a set of concepts C0, collect and weight similar concepts that
    intersect between pairs of concepts in C0.

    Inputs:
        concepts: Original concepts (C0), a list of words.
        words_to_exclude: New concepts duplicated in this list are removed.
            When omitted (or empty), defaults to ``concepts`` itself —
            backward compatible with the previous ``[]`` default.
        topX_intersect_concepts: Only keep the top X concepts found from the
            intersection of similar concepts between pairs of original
            concepts, i.e. C0(i) int C0(j) = IC(i,j)[:topX_intersect_concepts].
        topXsimilarConcepts: Retrieve the top X similar concepts per concept
            in C0.
        threshold: Cut-off that keeps only intersected concepts appearing in
            more than threshold * 100 percent of all C0 pairs.

    Output:
        DataFrame with columns 'Concept 1', 'Concept 2', 'Intersection' and
        'weight' (fraction of C0 pairs in which the intersected concept
        appears).

    NOTE(review): relies on a module-level word2vec ``model`` that is not
    defined in this file — confirm it is loaded before calling.
    '''
    import math

    # Avoid the mutable-default-argument pitfall; an empty/omitted exclusion
    # list falls back to the original concepts (same behaviour as before).
    if not words_to_exclude:
        words_to_exclude = list(concepts)

    # Collect one small frame per concept pair and concatenate once at the
    # end: DataFrame.append was removed in pandas 2.0 and was O(n^2) anyway.
    frames = []
    # Triangular iteration over pairs (i < j) due to symmetry.
    for i in range(len(concepts) - 1):
        # Hoisted out of the inner loop — it does not depend on j.
        similar_i = concepts_extraction_list(
            model.most_similar(concepts[i], topn=topXsimilarConcepts))
        for j in range(i + 1, len(concepts)):
            similar_j = set(concepts_extraction_list(
                model.most_similar(concepts[j], topn=topXsimilarConcepts)))
            # Ordered intersection (order of similar_i preserved), truncated
            # to the top X entries; set membership keeps this O(len) overall.
            inter = [v for v in similar_i if v in similar_j][:topX_intersect_concepts]
            if inter:
                frames.append(pd.DataFrame({'Concept 1': concepts[i],
                                            'Concept 2': concepts[j],
                                            'Intersection': inter}))
    if frames:
        df_temp = pd.concat(frames, ignore_index=True)
    else:
        df_temp = pd.DataFrame({'Concept 1': [], 'Concept 2': [],
                                'Intersection': []})

    df_extension = df_temp

    # Remove words that contain less than 4 characters.
    keep = [len(v) > 3 for v in df_extension['Intersection'].values]
    df_extension = df_extension[keep]

    # Weight creation: count of pairs each intersected concept appears in,
    # normalised by the number of concept pairs C(n, 2). math.comb replaces
    # the original 0.5 * n! / (n - 2)! (np.math was removed in NumPy 2.0).
    weights = (df_temp['Intersection'].value_counts()
               .rename_axis(['Intersection'])
               .reset_index(name='weight'))
    weights['weight'] = weights['weight'] / math.comb(len(concepts), 2)

    # Remove new concepts that are duplicated in the exclusion list.
    exclude = set(words_to_exclude)
    keep = [v not in exclude for v in df_extension['Intersection'].values]
    df_extension = df_extension[keep]

    df_extension = df_extension.merge(weights, on="Intersection")
    df_extension = df_extension.loc[df_extension['weight'] > threshold]
    return df_extension
#Example
# Expand a small seed set of royalty-related concepts; the result (a
# DataFrame of weighted intersected concepts) is discarded here.
concepts = ['king', 'queen', 'prince']
GEIC(concepts, topX_intersect_concepts=15)
Julian_BM/w2v_stringdist_cyto.py
deleted
100644 → 0
View file @
beeb826a
# -*- coding: utf-8 -*-
"""
Starting date: 09-02-21
Code for MSX: Extension of candidate words.
Functions:
words_to_sets: Check existent word2vec model for words similarities. Also
prune the similar words by string distances.
Input: set of words
Output: dictionary with original + similar words + similarity scores
sets_to_network: Converts part of the output of words_to_sets function into 2 objects
to be loaded in dash-cytoscape.
Input: dictionary with original + similar words
Output: two arrays to be used in dash-cyto
Codes include the use of an existent
word2vec pretrained model and string/substring distance.
Author: Julian Bianco Martinez
"""
### Loading model
import
gensim.downloader
as
api
# Different Pre-trained Embeddings
#model = api.load('word2vec-google-news-300')
# Download (or load from cache) the pretrained GloVe Twitter embeddings.
model = api.load('glove-twitter-200')
# Warm-up / sanity-check query; the returned list is discarded.
model.most_similar('people', topn=100)

# Newline-separated seed terms for the association network.
terms = """Jargon
Domain
Standardisation
Interactive tool
word2vec
NLP
Collaboration
Vocabulary
Graph
Meaning
Tool
Communication
People
Terms
Associations
Efficient"""

words_set = terms
#%% Functions
def words_to_sets(words_set):
    '''
    Check the word2vec model for words similar to each term in ``words_set``
    and prune the similar words by string distance.

    Input:
        words_set: newline-separated terms (one term per line); spaces
            inside a term are replaced by underscores before lookup.

    Output:
        similar_terms: dict mapping each in-vocabulary term to at most 10
            tuples ``(word, similarity, normalised_levenshtein, substring_pos)``.
        words_not_vocabulary: list of terms the model does not know.

    NOTE(review): relies on the module-level ``model`` loaded above; ``tqdm``
    and ``stringdist`` are third-party packages imported lazily here.
    '''
    from tqdm import tqdm
    import stringdist as sdi

    terms = words_set.lower().split("\n")
    terms = [word.replace(' ', '_') for word in terms]

    similar_terms = dict()
    words_not_vocabulary = []
    for term in tqdm(terms):
        # Only the model lookup raises KeyError (out-of-vocabulary term), so
        # only that call sits inside the try block.
        try:
            candidates = model.most_similar(term, topn=20)
        except KeyError:
            words_not_vocabulary.append(term)
            continue
        # Extend each (word, score) pair with a normalised Levenshtein
        # distance and the position of `term` inside the similar word
        # (enumerate replaces the original hand-rolled r counter).
        for idx, (word, _score) in enumerate(candidates):
            candidates[idx] = (candidates[idx]
                               + (sdi.rdlevenshtein_norm(term, word),)
                               + (word.find(term),))
        # Keep only lowercase, non-URL words that are far enough from the
        # original term (distance > 0.5) and do not contain it as a
        # substring (find == -1); cap at 10 results. The loop variable is
        # named `word` to avoid shadowing `term` as the original code did.
        similar_terms[term] = [
            (word, score, string_dist, substring)
            for word, score, string_dist, substring in candidates
            if word == word.lower()
            and "www" not in word
            and string_dist > 0.5
            and substring == -1
        ][:10]
    return similar_terms, words_not_vocabulary
def sets_to_network(similar_terms):
    '''
    Convert the output of ``words_to_sets`` into dash-cytoscape elements.

    Input:
        similar_terms: dict mapping each seed term to a list of tuples whose
            first element is a similar word (remaining elements are ignored).

    Output:
        nodes: list of cytoscape node dicts
            ``{"data": {"id": ..., "label": ...}, "classes": ...}`` — seed
            terms get class 'followerNode', similar words 'followingNode'.
        edges: list of cytoscape edge dicts, one per (seed term, similar
            word) pair.
    '''
    ### CytoScape structure Creation
    edges = []
    nodes = []
    # Bug fix: the original dedup checks compared raw strings/tuples against
    # the list of node *dicts*, so they never matched and duplicate nodes
    # were emitted whenever a word appeared under several seed terms. Track
    # the ids already added in a set instead.
    seen_ids = set()
    for item in similar_terms:
        if item not in seen_ids:
            nodes.append({"data": {"id": item, "label": item},
                          "classes": 'followerNode'})
            seen_ids.add(item)
        for entry in similar_terms[item]:
            word = entry[0]
            if word not in seen_ids:
                nodes.append({"data": {"id": word, "label": word},
                              "classes": 'followingNode'})
                seen_ids.add(word)
            # One edge per pair, even when the word's node already existed.
            edges.append({'data': {'source': item, 'target': word}})
    return nodes, edges
# Build the similarity sets for the hard-coded terms above, then convert
# them into cytoscape node/edge lists for the Dash app below.
similar_terms, words_not_vocabulary = words_to_sets(terms)
nodes, edges = sets_to_network(similar_terms)
### END Functions
#%% Dash
import
dash
import
dash_cytoscape
as
cyto
import
dash_html_components
as
html
#####Cyto Code
# Minimal Dash app that renders the word-association network with
# dash-cytoscape.
app = dash.Dash(__name__)

# define layout
default_stylesheet = [
    {
        # Show each node's label text.
        "selector": 'node',
        "style": {"label": "data(label)"}
    },
    {
        # Seed-term nodes ('followerNode') are drawn in blue; note that
        # 'followingNode' gets no styling here and keeps the default colour.
        'selector': '.followerNode',
        'style': {'background-color': '#00AAD9'}
    }
]

app.layout = html.Div([
    cyto.Cytoscape(
        id='test',
        #style={'width': '100%', 'height': '400px'},
        # 'cose' is a force-directed graph layout.
        layout={'name': 'cose'},
        elements=edges + nodes,
        stylesheet=default_stylesheet
    )
])

if __name__ == '__main__':
    app.run_server(debug=False)
\ No newline at end of file
dash_app/app.py
View file @
4c49b267
import
dash
import
os
from
dash_app.layout
import
external_stylesheets
app
=
dash
.
Dash
(
name
=
__name__
,
external_stylesheets
=
external_stylesheets
,
url_base_pathname
=
'/msx/'
,
URL_PREFIX
=
'/msx/'
if
os
.
getenv
(
'MSX_URL_PREFIX'
,
False
):
URL_PREFIX
=
os
.
environ
[
'MSX_URL_PREFIX'
]
app
=
dash
.
Dash
(
name
=
__name__
,
external_stylesheets
=
external_stylesheets
,
url_base_pathname
=
URL_PREFIX
,
suppress_callback_exceptions
=
True
)
dash_app/callbacks.py
View file @
4c49b267
...
...
@@ -34,16 +34,17 @@ def update_base_word(submit_word_button, base_word_input):
Output
(
component_id
=
'graph-elements-div'
,
component_property
=
'children'
),
Input
(
component_id
=
'submit-word-button'
,
component_property
=
'n_clicks'
),
Input
(
component_id
=
'add-word-button'
,
component_property
=
'n_clicks'
),
Input
(
component_id
=
'extend-graph-button'
,
component_property
=
'n_clicks'
),
Input
(
component_id
=
'remove-word-button'
,
component_property
=
'n_clicks'
),
State
(
component_id
=
'base-word-input'
,
component_property
=
'value'
),
State
(
component_id
=
'add-word-input'
,
component_property
=
'value'
),
State
(
component_id
=
'graph-elements-div'
,
component_property
=
'children'
),
State
(
component_id
=
'msx-graph'
,
component_property
=
'
tap
NodeData'
),
State
(
component_id
=
'msx-graph'
,
component_property
=
'
selected
NodeData'
),
State
(
component_id
=
'base-word-div'
,
component_property
=
'children'
),
prevent_initial_call
=
True
)
def
update_graph_elements
(
submit_word_button
,
add_word_button
,
remove_word_button
,
base_word_input
,
add_word_input
,
nodes_and_edges
,
selected_nodes
,
base_word_state
):
def
update_graph_elements
(
submit_word_button
,
add_word_button
,
extend_graph_button
,
remove_word_button
,
base_word_input
,
add_word_input
,
nodes_and_edges
,
selected_nodes
,
base_word_state
):
callback_context
=
dash
.
callback_context
button_id
=
callback_context
.
triggered
[
0
][
'prop_id'
].
split
(
'.'
)[
0
]
...
...
@@ -65,16 +66,24 @@ def update_graph_elements(submit_word_button, add_word_button, remove_word_butto
else
:
raise
PreventUpdate
if
button_id
==
'extend-graph-button'
:
graph
.
set_nodes_and_edges
(
json
.
loads
(
nodes_and_edges
))
graph
.
extend_graph
(
word2vec_model
,
base_word_state
)
new_nodes_and_edges
=
graph
.
get_nodes_and_edges
()
return
json
.
dumps
(
new_nodes_and_edges
)
if
button_id
==
'remove-word-button'
:
graph
.
set_nodes_and_edges
(
json
.
loads
(
nodes_and_edges
))
if
selected_nodes
is
not
None
:
selected_word
=
selected_nodes
[
'label'
]
if
selected_word
in
graph
.
get_all_words
()
and
selected_word
!=
base_word_state
:
graph
.
remove_node
(
selected_nodes
[
'label'
])
new_nodes_and_edges
=
json
.
dumps
(
graph
.
get_nodes_and_edges
())
return
new_nodes_and_edges
else
:
raise
PreventUpdate
if
len
(
selected_nodes
)
>
0
:
for
selected_node
in
selected_nodes
:
selected_word
=
selected_node
[
'id'
]
if
selected_word
in
graph
.
get_all_words
()
and
selected_word
!=
base_word_state
:
graph
.
remove_node
(
selected_word
)
else
:
raise
PreventUpdate
new_nodes_and_edges
=
json
.
dumps
(
graph
.
get_nodes_and_edges
())
return
new_nodes_and_edges
else
:
raise
PreventUpdate
...
...
@@ -93,8 +102,14 @@ def update_graph(nodes_and_edges):
@
app
.
callback
(
Output
(
component_id
=
'msx-graph'
,
component_property
=
'autoRefreshLayout'
),
Input
(
component_id
=
'add-word-button'
,
component_property
=
'n_clicks'
),
Input
(
component_id
=
'extend-graph-button'
,
component_property
=
'n_clicks'
),
Input
(
component_id
=
'remove-word-button'
,
component_property
=
'n_clicks'
),
prevent_initial_call
=
True
)
def
set_auto_refresh_layout
(
add_word_button
,
remove_word_button
):
return
False
def
set_auto_refresh_layout
(
add_word_button
,
extend_graph_button
,
remove_word_button
):
callback_context
=
dash
.
callback_context
button_id
=
callback_context
.
triggered
[
0
][
'prop_id'
].
split
(
'.'
)[
0
]
if
button_id
==
'add-word-button'
or
button_id
==
'remove-word-button'
:
return
False
if
button_id
==
'extend-graph-button'
:
return
True
dash_app/graph.py
View file @
4c49b267
...
...
@@ -6,6 +6,8 @@ class Graph:
def
__init__
(
self
):
self
.
nodes
=
[]
self
.
edges
=
[]
self
.
COUNT_THRESHOLD
=
2
self
.
MAX_NUM_WORDS
=
10
def
get_all_words
(
self
):
all_words
=
[
node_dict
[
'data'
][
'label'
]
for
node_dict
in
self
.
nodes
]
...
...
@@ -99,3 +101,22 @@ class Graph:
}
]
)
def
extend_graph
(
self
,
word2vec_model
,
base_node
):
current_words
=
[
node
[
'data'
][
'id'
]
for
node
in
self
.
nodes
]
all_associated_words
=
[]
for
current_word
in
current_words
:
associated_words
=
word2vec_model
.
get_associated_words
(
current_word
,
top_n
=
100
)
all_associated_words
.
extend
(
associated_words
)
associated_words_filtered
=
[
word
for
word
in
all_associated_words
if
word
not
in
current_words
]
associated_words_count
=
{
word
:
associated_words_filtered
.
count
(
word
)
for
word
in
list
(
set
(
associated_words_filtered
))}
count_threshold
=
self
.
COUNT_THRESHOLD
common_associated_words
=
[
word
for
word
,
count
in
associated_words_count
.
items
()
if
count
>=
count_threshold
]
while
len
(
common_associated_words
)
>
self
.
MAX_NUM_WORDS
:
count_threshold
+=
1
common_associated_words
=
[
word
for
word
,
count
in
associated_words_count
.
items
()
if
count
>=
count_threshold
]
self
.
add_nodes
(
common_associated_words
)
self
.
add_edges
(
base_node
,
common_associated_words
)
dash_app/layout.py
View file @
4c49b267
...
...
@@ -44,7 +44,10 @@ layout = html.Div(children=[
html
.
Div
(
className
=
'col-2'
,
children
=
[
html
.
Button
(
id
=
'add-word-button'
,
n_clicks_timestamp
=
0
,
children
=
'Add Association'
,
className
=
'btn btn-success btn-lg'
),
]),
html
.
Div
(
className
=
'col-4'
,
children
=
[
html
.
Div
(
className
=
'col-2'
,
children
=
[
html
.
Button
(
id
=
'extend-graph-button'
,
n_clicks_timestamp
=
0
,
children
=
'Extend Graph'
,
className
=
'btn btn-success btn-lg'
),
]),
html
.
Div
(
className
=
'col-2'
,
children
=
[
]),
html
.
Div
(
className
=
'col-3'
,
children
=
[
html
.
Button
(
id
=
'remove-word-button'
,
n_clicks_timestamp
=
0
,
children
=
'Remove Selected Association'
,
className
=
'btn btn-danger btn-lg'
)]),
...
...
dash_app/words.py
View file @
4c49b267
...
...
@@ -5,13 +5,12 @@ import stringdist as sdi
class
AssociatedWords
:
def
__init__
(
self
):
self
.
N_RESULTS
=
10
print
(
"
\n
Word2Vec model is loading.This can take a couple of minutes."
)
print
(
"
\n
Word2Vec model is loading. This can take a couple of minutes."
)
self
.
model
=
api
.
load
(
'glove-twitter-200'
)
print
(
" Word2Vec model is ready. Enjoy!!!
\n
"
)
def
get_associated_words
(
self
,
word
):
gensim_result
=
self
.
model
.
most_similar
(
word
,
topn
=
self
.
N_RESULTS
)
def
get_associated_words
(
self
,
word
,
top_n
=
10
):
gensim_result
=
self
.
model
.
most_similar
(
word
,
topn
=
top_n
)
# gensim_result = [('apple', 1.0), ('banana', 1.0), ('strawberry', 1.0)]
words
=
self
.
filter_results
(
gensim_result
,
word
)
return
words
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment