20 changes: 15 additions & 5 deletions randomwalks_utility/create_corpora_from_random_walks.py
@@ -1,5 +1,6 @@
import pickle
import codecs

from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed

@@ -46,7 +47,7 @@ def chunks(lst, n):
for i in range(0, len(lst), n):
yield lst[i:i + n]

def generate_corpus_from_walks(walks, output_path_prefix="./randomwalks/rw_corpus_", output_path_suffix=""):
def generate_corpus_from_walks(walks, output_path_prefix="./data/rw_corpus_", output_path_suffix=""):
# how do we actually want to generate the corpus?
# one option is to always duplicate the node in the middle.
# Goran also says that we want to keep the relations as separate tokens in the vocab. I do not necessarily agree with this, but we try it.
@@ -75,13 +76,22 @@ def generate_corpus_from_walks(walks, output_path_prefix="./randomwalks/rw_corpu
with codecs.open(output_path, "w", "utf8") as out:
out.write(text)

def analyze_walks(path="./output/random_walk"):
walks = load_walks(path)
total_num_walks = len(walks)
print("loaded %d walks from %s" % (total_num_walks, path))



def main():
in_prefix = "randomwalks/random_walk_"
in_suffix = "1.0_1.0_2_15"
walks = load_walks(in_prefix + in_suffix + ".p")
generate_corpus_from_walks(walks, output_path_suffix=in_suffix + "_nl")
for in_suffix in ["1.0_1.0_5_20","1.0_1.0_5_30"]:
in_prefix = "./data/random_walk_full_assertions_"
#in_suffix = "1.0_1.0_5_10"
walks = load_walks(in_prefix + in_suffix + ".p")
generate_corpus_from_walks(walks, output_path_suffix=in_suffix + "full_assertions_nl")


if __name__=="__main__":
main()



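The comments in generate_corpus_from_walks above weigh how a walk should be turned into a corpus line (whether to duplicate the node in the middle, and whether to keep the relation phrases as separate vocabulary tokens). A minimal sketch of the simplest variant, assuming each walk is a list of alternating node and relation-phrase tokens; the function name and output path are illustrative, not the repository's API:

import codecs

# Illustrative sketch only: flatten each walk into one corpus line,
# keeping the natural-language relation phrases as tokens.
def walks_to_corpus(walks, output_path="rw_corpus_example.txt"):
    # 'walks' is assumed to be a list of lists of alternating node and
    # relation tokens, e.g. ["dog", "is a", "animal", "is capable of", "bark"].
    lines = [" ".join(walk) for walk in walks]
    with codecs.open(output_path, "w", "utf8") as out:
        out.write("\n".join(lines))

walks_to_corpus([["dog", "is a", "animal", "is capable of", "bark"]])
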
89 changes: 66 additions & 23 deletions randomwalks_utility/preprocess_cn.py
@@ -1,34 +1,78 @@
import codecs

"""
Since we got the relations to consider from Olga, we don't need to do this anymore.
"""
# def filter_assertions(path="./relations/assertions.csv"):
# assertions = []
# with codecs.open(path, "r", "utf8") as f:
# reader = csv.DictReader(f, dialect=csv.excel_tab, fieldnames=["URI", "relation", "node_a", "node_b", "info"])
# for i,row in enumerate(reader):
# node_a = row["node_a"].split("/c/en/")
# node_b = row["node_b"].split("/c/en/")
# if len(node_a) > 1 and len(node_b) > 1:
# # these should be nodes in english
# node_a = node_a[1].split("/")[-1].replace("_", "-")
# node_b = node_b[1].split("/")[-1].replace("_", "-")
# print(node_a)
# print(node_b)

"""
Based on the relations from Olga
"""
def create_joined_assertions_for_random_walks(paths=["./relations/cn_antonyms.txt", "./relations/cn_isA.txt", "./relations/cn_mannerOf.txt","./relations/cn_synonyms.txt"], output_path="./randomwalks/cn_assertions_filtered.tsv"):
def create_joined_assertions_for_random_walks(paths=["./data/cn_antonyms.txt",
"./data/cn_isA.txt",
"./data/cn_mannerOf.txt",
"./data/cn_synonyms.txt",
"./data/cn_atLocation.txt",
"./data/cn_capableOf.txt",
"./data/cn_causes.txt",
"./data/cn_causesDesire.txt",
"./data/cn_createdBy.txt",
"./data/cn_definedAs.txt",
"./data/cn_derivedFrom.txt",
"./data/cn_desires.txt",
"./data/cn_distinctFrom.txt",
"./data/cn_hasA.txt",
"./data/cn_hasContext.txt",
"./data/cn_hasFirstSubevent.txt",
"./data/cn_hasLastSubevent.txt",
"./data/cn_hasSubevent.txt",
"./data/cn_hasPrerequisite.txt",
"./data/cn_hasProperty.txt",
"./data/cn_locatedNear.txt",
"./data/cn_madeOf.txt",
"./data/cn_motivatedByGoal.txt",
"./data/cn_obstructedBy.txt",
"./data/cn_partOf.txt",
"./data/cn_receivesAction.txt",
"./data/cn_relatedTo.txt",
"./data/cn_similarTo.txt",
"./data/cn_symbolOf.txt",
"./data/cn_usedFor.txt"],
output_path="./data/cn_assertions_full.tsv"):
"""
Transforms a series of relation input files into a joint file containing natural language assertions
:param paths: relation input paths
:param output_path: output path
:return:
"""
# we ideally want to have a "natural language representation" of the relations
# TODO: keep in mind that antonymy and synonymy are bidirectional relationships, so we may want to account for this, e.g., by also creating the corresponding pairs in the opposite direction
# TODO: As an alternative to random walks, we can also just use the natural language representation of the relationships
# TODO: For camera-ready version: Run everything again with this improved mapping
relation_dict = {
"antonyms": "is an antonym of",
"isA": "is a",
"mannerOf": "is a manner of",
"synonyms": "is a synonym of"
"synonyms": "is a synonym of",
"atLocation": "is at",
"capableOf": "is capable of",
"causes": "causes",
"causesDesire": "causes the desire to",
"createdBy": "is created by",
"definedAs": "is defined as",
"derivedFrom": "is derived from",
"desires": "desires",
"distinctFrom": "is distinct from",
"hasA": "has a",
"hasContext": "is used in the context of",
"hasFirstSubevent": "begins with",
"hasLastSubevent": "concludes with",
"hasSubevent": "has as subevent",
"hasPrerequisite": "is dependent on",
"hasProperty": "can be described as",
"locatedNear": "is located near",
"madeOf": "is made of",
"motivatedByGoal": "is motivated by",
"obstructedBy": "is obstructed by",
"partOf": "is part of",
"receivesAction": "receives as action",
"relatedTo": "is related to",
"similarTo": "is similar to",
"symbolOf": "is a symbol of",
"usedFor": "is used for",
}
all_assertions = []
for path in paths:
@@ -53,8 +97,7 @@ def create_joined_assertions_for_random_walks(paths=["./relations/cn_antonyms.tx

def main():
create_joined_assertions_for_random_walks()
#profile_data()
#filter_assertions()

if __name__ == "__main__":
main()

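One of the TODOs in create_joined_assertions_for_random_walks notes that antonymy and synonymy are bidirectional relationships. A minimal sketch of how reversed pairs could be generated for symmetric relations, assuming assertions are (node_a, relation, node_b) triples; the helper name and the exact set of symmetric relations are assumptions, not part of this change:

# Illustrative sketch only: add the reverse pair for relations that are symmetric.
SYMMETRIC_RELATIONS = {"antonyms", "synonyms", "distinctFrom", "relatedTo", "similarTo"}

def expand_symmetric(assertions):
    # 'assertions' is assumed to be a list of (node_a, relation, node_b) triples.
    expanded = list(assertions)
    for node_a, relation, node_b in assertions:
        if relation in SYMMETRIC_RELATIONS:
            expanded.append((node_b, relation, node_a))
    return expanded

print(expand_symmetric([("big", "antonyms", "small"), ("dog", "isA", "animal")]))
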
29 changes: 15 additions & 14 deletions randomwalks_utility/random_walks.py
@@ -3,7 +3,7 @@
import random
import pickle

def read_graph(path="./randomwalks/cn_assertions_filtered.tsv"):
def read_graph(path="./data/cn_assertions_filtered.tsv"):
'''
Reads the input network in networkx.
'''
@@ -195,21 +195,22 @@ def generate_random_walks_from_assertions():
p = 1.0 # return hyperparameter
q = 1.0 # inout hyperparameter
is_directed = True # whether the graph is directed
num_walks = 2 # number of random walks per source def. 10
walk_length = 15 # length of walk per source def. 80
num_walks = 5 # number of random walks per source def. 10
for walk_length in [20, 30]:
#walk_length = 20 # length of walk per source def. 80

nx_G = read_graph(path="./randomwalks/cn_assertions_filtered.tsv")
G = Graph(nx_G, is_directed, p, q)
G.preprocess_transition_probs()
walks = G.simulate_walks(num_walks, walk_length)
filename = "./randomwalks/random_walk_" + str(p) + "_" + str(q) + "_" + str(num_walks) + "_" + str(walk_length) + ".p"
with open(filename, 'wb') as handle:
pickle.dump(walks, handle)
print(len(walks))
nx_G = read_graph(path="./data/cn_assertions_full.tsv")
G = Graph(nx_G, is_directed, p, q)
G.preprocess_transition_probs()
walks = G.simulate_walks(num_walks, walk_length)
filename = "./data/random_walk_full_assertions_" + str(p) + "_" + str(q) + "_" + str(num_walks) + "_" + str(walk_length) + ".p"
with open(filename, 'wb') as handle:
pickle.dump(walks, handle)
print(len(walks))


def analyze_graph():
nx_G = read_graph(path="./randomwalks/cn_assertions_filtered.tsv")
nx_G = read_graph(path="./data/cn_assertions_full.tsv")
print("%d nodes in the graph" % nx_G.number_of_nodes())
print("%d edges in the graph" % nx_G.number_of_edges())
print("%f density of graph" % nx.density(nx_G))
@@ -250,7 +251,7 @@ def load_random_walk(p):
def main():
generate_random_walks_from_assertions()
#analyze_graph()
# load_random_walk(p="./randomwalks/random_walk_1.0_1.0_2_10.p")
#load_random_walk(p="./data/random_walk_1.0_1.0_2_10.p")

if __name__=="__main__":
main()
main()
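
The p and q values in generate_random_walks_from_assertions are commented as return and in-out hyperparameters, which matches the node2vec convention: p biases the walk toward returning to the node it just came from, while q biases it toward moving further away. A minimal sketch of the unnormalized second-order transition weight under that convention; this is illustrative and not taken from this repository's Graph class:

import networkx as nx

def node2vec_weight(G, prev, cur, nxt, p=1.0, q=1.0):
    # Unnormalized weight for stepping cur -> nxt, given the walk arrived from prev.
    if nxt == prev:
        return 1.0 / p   # distance 0 from prev: return to the previous node
    elif G.has_edge(nxt, prev):
        return 1.0       # distance 1 from prev: stay close
    else:
        return 1.0 / q   # distance 2 from prev: move outward

G = nx.Graph([("dog", "animal"), ("animal", "cat"), ("dog", "bark")])
print(node2vec_weight(G, prev="dog", cur="animal", nxt="cat"))  # 1.0 / q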