From c51359fbf72f37df0a4d9213b27c45069afa285b Mon Sep 17 00:00:00 2001 From: Sam Hardwick Date: Tue, 7 Feb 2023 08:58:25 +0200 Subject: [PATCH] Initial version of multi-step language-based parallelisation exercise --- _hands-on/parallelise/README.md | 4 + _hands-on/parallelise/lda.md | 293 ++++++++++++++++++++ _hands-on/parallelise/parse_vrt.py | 69 +++++ _hands-on/parallelise/parse_vrt_solution.py | 71 +++++ _hands-on/parallelise/requirements.txt | 2 + _hands-on/parallelise/stopwords.txt | 50 ++++ _hands-on/parallelise/topics.py | 58 ++++ _hands-on/parallelise/topics_solution.py | 59 ++++ 8 files changed, 606 insertions(+) create mode 100644 _hands-on/parallelise/README.md create mode 100644 _hands-on/parallelise/lda.md create mode 100644 _hands-on/parallelise/parse_vrt.py create mode 100644 _hands-on/parallelise/parse_vrt_solution.py create mode 100644 _hands-on/parallelise/requirements.txt create mode 100644 _hands-on/parallelise/stopwords.txt create mode 100644 _hands-on/parallelise/topics.py create mode 100644 _hands-on/parallelise/topics_solution.py diff --git a/_hands-on/parallelise/README.md b/_hands-on/parallelise/README.md new file mode 100644 index 00000000..f443a577 --- /dev/null +++ b/_hands-on/parallelise/README.md @@ -0,0 +1,4 @@ +# Parallelise + +## Exercises +* [Fetch text and do topic modeling](lda.md) - this exercise involves installing prerequisites and four instances of modifying non-parallel code to be parallel. diff --git a/_hands-on/parallelise/lda.md b/_hands-on/parallelise/lda.md new file mode 100644 index 00000000..ef20382b --- /dev/null +++ b/_hands-on/parallelise/lda.md @@ -0,0 +1,293 @@ +--- +topic: installing +title: Exercise - Fetch text in VRT format and do topic modeling +--- + + + + + + + +# Exercise - Fetch text in VRT format and do topic modeling + +💬 In this exercise we will experiment with a common task for text, topic modeling. You can go through the tutorial step by step and do all the exercises, or just read it. The exercises are in the form of Python code that you can edit to make it run faster. Solutions are included. + +## A processing node and workspaces + +☝🏻 First, hop into an interacive computing node with `sinteractive --time 08:00:00 --mem 32000 --cores 4`, it will prompt you to select a CSC project. `--time 08:00:00` means that the node will kick you out after 8 hours. If you exit the node before then, you will save on billing units; the reservation is more for scheduling than billing purposes. `--mem 32000` means 32000 megabytes, and `--cores 4` means you will be able to run that many processes simultaneously (for this small example). + +We will organise workspaces as follows: data archives, which we don't want to download many times, go to a non-temporary location, like `/scratch///`. The same with dependencies and code. But we'll unpack data into `$TMPDIR`. The reason for this is that `$TMPDIR` will be a disk local to the computing node, so it will be fast for reading and writing. In this particular case with just a few files it barely matters, but it's a good habit to learn. + +For dependencies and code, making a directory for this under `/scratch//` is a good choice, since we're just trying things out. You can make sure that this directory exists with `mkdir -p /scratch//$USER`. In that directory, fetch some starter code into a new directory with `wget https://a3s.fi/hardwick-clarin-pub/lda.zip; unzip lda.zip` (assuming you don't already have a directory called `lda`. Then `cd lda` into the directory. This will be our workspace. 
The Python scripts ending with `_solution.py` contain solutions to the exercises.

## Dependencies

☝🏻 We need to install some dependencies. This can be done in many ways, some simpler than others, and some more efficient than others.

There are essentially three alternatives for installing Python dependencies:

1) Installing them in your home directory with `pip install --user`. This quickly becomes unmaintainable with many projects and library versions.
2) Installing in a virtual environment with `venv` or `conda`. This has some downsides on the HPC systems, causing slow startup times and unnecessary IO load on the whole system.
3) An Apptainer container, built with our custom tool `tykky`; this is usually the ideal option.

If you have a `requirements.txt` file, as we do here, installing the dependencies into a `tykky` environment is in principle simple, as long as your libraries support the default Python version, which at the time of writing is 3.6. Unfortunately, that's too old for us, so we'll first make a temporary `venv` in which to build the `tykky` container with python3.9. So we do:

```bash
$ mkdir tykky-env # the tykky environment will go here
$ python3.9 -m venv tmp-venv # create a temporary venv with the correct Python version
$ source tmp-venv/bin/activate # step into the venv
$ module load tykky # load the tykky module
$ pip-containerize new --prefix /scratch/<project>/$USER/lda/tykky-env requirements.txt # or whatever directory you chose
$ deactivate # exit the temporary venv
$ rm -rf tmp-venv # not needed anymore
$ export PATH="/scratch/<project>/$USER/lda/tykky-env/bin:$PATH" # make the tykky environment visible
```

For the rest of this session, your default Python environment will have the packages from `requirements.txt` installed. After logging out, things will be back to the way they were before; then you can `export PATH` again, or set the path on every login in e.g. `.bash_profile`.

## Data

The Language Bank of Finland keeps its analyzed text data in a format called [VRT](https://www.kielipankki.fi/development/korp/corpus-input-format/). VRT is used because it's the format of the [IMS Open Corpus Workbench](http://cwb.sourceforge.net/) (CWB), so it's not exactly a common standard, but it's easy enough to use for many purposes. We will fetch some VRT files, extract the lemmas, and use the lemmas as input to a topic modeling package.

The Language Bank maintains a [directory of corpora](https://www.kielipankki.fi/corpora/) which you can browse for corpora available to you. Each corpus is listed with license information: PUB means available to everyone, ACA means available to users affiliated with an academic institution, and RES means you have to apply for access.

💡 If you are a member of the `kieli` group on `puhti`, you can find read-only VRT data under `/appl/data/kielipankki/`. Otherwise, you can follow download links from the corpus directory.

The rest of this example will use the YLE news in Finnish corpus (the 2019-2021 portion), which can be downloaded [here](https://korp.csc.fi/download/YLE/fi/2019-2021-s-vrt/).

```bash
$ wget https://korp.csc.fi/download/YLE/fi/2019-2021-s-vrt/ylenews-fi-2019-2021-s-vrt.zip

$ unzip ylenews-fi-2019-2021-s-vrt.zip -d $TMPDIR
  creating: /local_scratch/<username>/ylenews-fi-2019-2021-s-vrt/
  ...
```

We should now have three VRT files under `$TMPDIR/ylenews-fi-2019-2021-s-vrt/vrt` of roughly two gigabytes each.
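If you like, you can also confirm this from Python. Below is a minimal sketch, assuming the same `$TMPDIR` location used in the `unzip` command above:

```python
import os

vrt_dir = os.path.join(os.environ["TMPDIR"], "ylenews-fi-2019-2021-s-vrt", "vrt")
for name in sorted(os.listdir(vrt_dir)):
    size_gb = os.path.getsize(os.path.join(vrt_dir, name)) / 1e9
    print(f"{name}: {size_gb:.1f} GB")  # expect three .vrt files of roughly 2 GB each
```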
+ +### Data format + +💭 Let's take a quick look at the files so we have some idea of what we're dealing with: + +```bash +$ head $TMPDIR/ylenews-fi-2019-2021-s-vrt/vrt/ylenews_fi_2019_s.vrt + + + + + +Kun 1 kun kun C SUBCAT_CS|CASECHANGE_Up 2 mark |kun..kn.1| +käännyin 2 kääntyä kääntyä V PRS_Sg1|VOICE_Act|TENSE_Prt|MOOD_Ind 5 advcl |kääntyä..vb.1| +katsomaan 3 katsoa katsoa V NUM_Sg|CASE_Ill|VOICE_Act|INF_Inf3 2 xcomp |katsoa..vb.1| +, 4 , , Punct _ 2 punct |,..xx.1| +huomasin 5 huomata huomata V PRS_Sg1|VOICE_Act|TENSE_Prt|MOOD_Ind 0 ROOT |huomata..vb.1| +``` + +VRT is a pseudo-xml format. By pseudo I mean that it doesn't have a root node, but is instead a sequence of `text` elements. (There are some other differences but that's not important right now.) The leaf nodes which contain text (here, `sentence`), have one token per line, with fields separated by tabs. So it's a TSV (tab-separated values) format inside an XML-like format. The first line indicates what the fields mean; the first one is `word`, for word form, the second is `ref`, for token number, `lemma` for lemma and so on. + +💭 You may notice that the `text` element has some interesting attributes, like `departments`, `main_department` and `publisher`. Unfortunately the `main_department` is usually empty (the commands are `unix` tools available on every system): + +```bash +$ grep --only-matching 'main_department="[^"]*' $TMPDIR/ylenews-fi-2019-2021-s-vrt/vrt/ylenews_fi_2019_s.vrt | sed 's/main_department="//' | sort | uniq -c | sort -nr + 62104 + 319 Yle TV1 + 263 Klassinen + 184 Yleisradio + 164 Luonto + 160 Strömsö + 160 Kulttuuricocktail +... +``` + +The `publisher` never is: + +```bash +$ grep --only-matching 'publisher="[^"]*' $TMPDIR/ylenews-fi-2019-2021-s-vrt/vrt/ylenews_fi_2019_s.vrt | sed 's/publisher="//' | sort | uniq -c | sort -nr + 38110 Yle Uutiset + 14104 Yle Urheilu + 8882 Yle Uutiset - lyhyet + 1301 yle-aihe +... +``` +The attributes come from the data source, and there's no general rule as to what you can rely on. Clearly here `publisher` is somewhat meaningful and very reliable, `main_department` has more detail, but is very sparse (perhaps we could fill it in ourselves!). + +## Data processing + +💬 Moving on, we can try to run `parse_vrt.py`, which by default builds lists of lemmas of each text, and then does nothing with them. It should look something like this: + +```bash +$ python3 parse_vrt.py $TMPDIR/ylenews-fi-2019-2021-s-vrt/vrt +Running parse_vrt_in_dir... + Reading ylenews-fi-2019-2021-s-vrt/vrt/ylenews_fi_2019_s.vrt + Finished reading ylenews-fi-2019-2021-s-vrt/vrt/ylenews_fi_2019_s.vrt, 65811 texts and 25772447 tokens + Reading ylenews-fi-2019-2021-s-vrt/vrt/ylenews_fi_2020_s.vrt + Finished reading ylenews-fi-2019-2021-s-vrt/vrt/ylenews_fi_2020_s.vrt, 63004 texts and 27871609 tokens + Reading ylenews-fi-2019-2021-s-vrt/vrt/ylenews_fi_2021_s.vrt + Finished reading ylenews-fi-2019-2021-s-vrt/vrt/ylenews_fi_2021_s.vrt, 56543 texts and 25374938 tokens +...finished in 136.04 s +``` + +### First task + +☝🏻 Your first task, should you choose to accept it, is to replace the sequential processing of VRT files in `parse_vrt.py` with parallel processing, and then verify that you are able to accomplish this step faster with parallel than sequential processing. + +
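Before parallelising anything, it may help to see in miniature what the parsing step does with a single token line: split on tabs and keep the lemma column. This is a toy illustration only (the example line is taken from the `head` output above, with the tabs written out explicitly):

```python
line = "katsomaan\t3\tkatsoa\tkatsoa\tV\tNUM_Sg|CASE_Ill|VOICE_Act|INF_Inf3\t2\txcomp\t|katsoa..vb.1|"
lemma_col = 3                            # as in parse_vrt.py
lemma = line.split("\t")[lemma_col - 1]  # third tab-separated field
print(lemma)                             # -> katsoa
```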
<details><summary>Hint 1</summary>

There are several files to read and process, so you can process different files separately and combine the results afterwards.

</details>
<details><summary>Hint 2</summary>

The standard library module `multiprocessing` has helpful facilities for this, such as `multiprocessing.Pool`, which can be used to `map` inputs to outputs in parallel. A minimal sketch of the pattern follows below.

</details>
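If `multiprocessing.Pool` is new to you, here is a small self-contained sketch of the `map` pattern on a toy function. The file names are hypothetical and the snippet is not part of the exercise code:

```python
from multiprocessing import Pool

def count_tokens(filename):
    # toy per-file work: count whitespace-separated tokens
    with open(filename) as f:
        return sum(len(line.split()) for line in f)

if __name__ == "__main__":
    filenames = ["a.txt", "b.txt", "c.txt"]   # hypothetical input files
    with Pool(processes=4) as pool:
        # each file is handled by a separate worker process
        counts = pool.map(count_tokens, filenames)
    print(dict(zip(filenames, counts)))
```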
<details><summary>Solution</summary>

One possible solution is included in `parse_vrt_solution.py`; the relevant change is shown below.

In `parse_vrt.py`:
```python
    # Exercise 1: parallelise parsing the corpora
    # Hint: you can use the Python standard library for this
    retval = []
    for filename in os.listdir(dirname):
        if not filename.endswith('.vrt'):
            continue
        retval += vrt2lemmalists(os.path.join(dirname, filename))
```

Solution:
```python
    # Exercise 1 solution (one possible one): we map each filename to a
    # vrt2lemmalists call using multiprocessing.Pool
    from multiprocessing import Pool
    retval = []
    # First we get the valid file names
    filenames = [os.path.join(dirname, filename) for filename in os.listdir(dirname) if filename.endswith('.vrt')]
    # Then we initialize a Pool object
    with Pool() as pool: # by default, processes = number of cores
        for result in pool.map(vrt2lemmalists, filenames):
            # We add the result lists together
            retval += result
```

This should run in 50-60 seconds instead of 130-140 seconds.

</details>
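One practical note that the exercise does not cover: `Pool()` with no arguments starts `os.cpu_count()` workers, and on a shared HPC node that is the whole machine, not just the cores you reserved with `sinteractive`. On Linux you can size the pool from the cores actually available to your process, for example like this (a sketch, not part of the exercise files):

```python
import os
from multiprocessing import Pool

def square(x):
    return x * x

if __name__ == "__main__":
    # cores usable by this process; this usually reflects the Slurm allocation,
    # unlike os.cpu_count(), which reports the whole node
    n_workers = len(os.sched_getaffinity(0))
    with Pool(processes=n_workers) as pool:
        print(pool.map(square, range(10)))
```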
+ +## Topic modelling + +💬 Next we will use `gensim` to do some topic modeling. The Python script `topics.py` uses `parse_vrt.py` to get data, and processes it in various ways. Try running it with the same argument: + +```bash +$ python3 topics.py $TMPDIR/ylenews-fi-2019-2021-s-vrt/vrt +Running parse_vrt_in_dir... + Reading ylenews-fi-2019-2021-s-vrt/vrt/ylenews_fi_2019_s.vrt + Finished reading ylenews-fi-2019-2021-s-vrt/vrt/ylenews_fi_2019_s.vrt, 65811 texts and 25772447 tokens + Reading ylenews-fi-2019-2021-s-vrt/vrt/ylenews_fi_2020_s.vrt + Finished reading ylenews-fi-2019-2021-s-vrt/vrt/ylenews_fi_2020_s.vrt, 63004 texts and 27871609 tokens + Reading ylenews-fi-2019-2021-s-vrt/vrt/ylenews_fi_2021_s.vrt + Finished reading ylenews-fi-2019-2021-s-vrt/vrt/ylenews_fi_2021_s.vrt, 56543 texts and 25374938 tokens +...finished in 133.57 s +Building gensim dictionary... Done in 21.49 s +Computing BOW corpus... Done in 14.11 s +Computing LDA model... Done in 114.99 s +Computing model coherence... + Coherence with 10 topics was -1.8115932174225207 +Done in 1.53 s +[topic printout] +``` + +After the one step from the previous section, we have added three more sections. All of them can be parallelised, but not all of them offer the same potential. If you are interested in parallelising code, they are all interesting examples, but the most important practical skill is to recognise at this point that these steps represent 47%, 7%, 5% and 41% of the runtime respectively, so that is the ceiling to how much can be accomplished by speeding them up. It is also often the case that relatively fast tasks also have relatively little to gain from parallelisation. We will tackle them in reverse order, from most to least useful. + +### Second task + +☝🏻 Replace the LdaModel class with something else to accomplish the same result, but quicker. + +
<details><summary>Hint 1</summary>

Go to the `gensim` [API reference](https://radimrehurek.com/gensim/apiref.html) and search the page for "models.lda".

</details>
<details><summary>Hint 2</summary>

Try the `gensim.models.LdaMulticore` class.

</details>
<details><summary>Solution</summary>

Change this:
```python
lda = gensim.models.LdaModel(bow_corpus, num_topics = n_topics)
```
To this:
```python
lda = gensim.models.LdaMulticore(bow_corpus, num_topics = n_topics, workers = n_workers)
```

Here `n_workers` is the number of worker processes you choose; `topics_solution.py` uses 4, matching the cores reserved earlier.

</details>

### Third task

☝🏻 Parallelise computing the BOW corpus. This means replacing the texts (or in this case, lists of lemmas) with bag-of-words representations. Each text undergoes the same independent transformation, so this should be possible to parallelise.

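For concreteness, here is what that transformation looks like on toy data (illustrative only; the exact token ids will vary):

```python
import gensim

dictionary = gensim.corpora.Dictionary([["cat", "dog", "cat"], ["dog", "mouse"]])
print(dictionary.doc2bow(["cat", "cat", "mouse"]))
# something like [(0, 2), (2, 1)]: a list of (token id, count) pairs
```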
<details><summary>Hint 1</summary>

You can use `multiprocessing.Pool.map` like in the first exercise, but in an even simpler way: the result is simply the map.

</details>
<details><summary>Solution</summary>

Change this:
```python
bow_corpus = [dictionary.doc2bow(text) for text in corpus_lemmalists]
```
To this:
```python
with Pool(processes = n_workers) as pool:
    bow_corpus = pool.map(dictionary.doc2bow, corpus_lemmalists)
```

This doesn't really save any processing time, because the cost of sending each text to a worker and the result back is comparable to the cost of computing `doc2bow` itself. You can experiment with values of `n_workers` and with `chunksize`, an optional argument to `map()`; a small experiment is sketched below.

</details>
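To get a feel for how much the per-item messaging costs, here is a small self-contained experiment (illustrative only, not part of the exercise files) that maps a trivial function with different `chunksize` values:

```python
import time
from multiprocessing import Pool

def work(x):
    return x * x

if __name__ == "__main__":
    data = list(range(200_000))
    for chunksize in (1, 100, 10_000):
        start = time.time()
        with Pool(processes=4) as pool:
            # larger chunks mean fewer, bigger inter-process messages
            pool.map(work, data, chunksize=chunksize)
        print(f"chunksize={chunksize}: {time.time() - start:.2f} s")
```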

### Fourth task

☝🏻 Parallelise computing the `gensim` dictionary. This is the trickiest of the exercises, and the one with the least practical payoff.

<details><summary>Hint 1</summary>

The `Dictionary` object has a method `merge_with(other)`, which we can use to turn a collection of dictionaries into one. But to do this we also need to split the source data, which is a list, into sublists. A toy illustration of the merge follows below.

</details>
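As a toy illustration of the merge pattern (standalone data, not the exercise corpus):

```python
import gensim

part1 = [["cat", "dog"], ["dog", "mouse"]]
part2 = [["mouse", "horse"], ["cat", "horse"]]

d1 = gensim.corpora.Dictionary(part1)
d2 = gensim.corpora.Dictionary(part2)
d1.merge_with(d2)      # d1 is updated in place to cover both vocabularies
print(len(d1), d1.token2id)
```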
<details><summary>Hint 2</summary>

You can make sublists with a generator expression like this:
```python
def split_list(l, n):
    return (l[i:i+n] for i in range(0, len(l), n))
```

</details>
<details><summary>Solution</summary>

```python
def split_list(l, n):
    return (l[i:i+n] for i in range(0, len(l), n))

dictionary = None
with Pool(processes = n_workers) as pool:
    for sub_dictionary in pool.map(gensim.corpora.Dictionary,
                                   split_list(corpus_lemmalists, 5000)):
        if dictionary is None:
            dictionary = sub_dictionary
        else:
            dictionary.merge_with(sub_dictionary)
```

</details>
+ +## Finally + +We have focused on getting dependencies installed on CSC's HPC systems and on parallelism, but of course there are more general things that could be done to speed up a process like this. If a system like this is run many times with a lot of data, data preprocessing need not be done at every step. You can do it once and use that result as a cache. Later steps depend on earlier steps, so you could set up a Makefile-like system to only redo steps that have changed or depend on those changes. The more times something is run, the more it pays to optimize it. diff --git a/_hands-on/parallelise/parse_vrt.py b/_hands-on/parallelise/parse_vrt.py new file mode 100644 index 00000000..702a006c --- /dev/null +++ b/_hands-on/parallelise/parse_vrt.py @@ -0,0 +1,69 @@ +import os +import sys +import time +from lxml import etree + +# a list of common semantically useless lemmas +stopwords = set([line.strip() for line in open('stopwords.txt')]) + +def is_content_word(lemma): + return lemma.isalpha() and lemma not in stopwords + +# EDIT THIS FUNCTION +def parse_vrt_in_dir(dirname): + ''' + Parse each file ending in .vrt in dirname in parallel, and return their concatenation. + ''' + start_time = time.time() + sys.stderr.write(f"Running parse_vrt_in_dir...\n"); + + # Exercise 1: parallelise parsing the corpora + # Hint: you can use the Python standard library for this + retval = [] + for filename in os.listdir(dirname): + if not filename.endswith('.vrt'): + continue + retval += vrt2lemmalists(os.path.join(dirname, filename)) + + # How long did we take? + sys.stderr.write( + f"...finished in {time.time() - start_time:.2f} s\n") + return retval + +def vrt2lemmalists(filename, max_texts = None, lemma_col = 3): + ''' + Parse each text in a VRT file into a list of lemmas, and return a list of those lists. + ''' + + sys.stderr.write(f" Reading {filename}\n") + retval = [] + fobj = open(filename, "rb") + parser = etree.XMLParser(recover = True) + + text_count = 0 + token_count = 0 + for line in fobj: + if max_texts and text_count >= max_texts: + break + parser.feed(line) + if line.strip() != b'
': + continue + # A text has ended + text_count += 1 + text = parser.close() + this_text = [] + for leaf in text.iter(): + tokens = leaf.text.strip() + if tokens == "": + continue + for token in tokens.split('\n'): + token_count += 1 + lemma = token.split('\t')[lemma_col-1] + if is_content_word(lemma): + this_text.append(lemma) + retval.append(this_text) + sys.stderr.write(f" Finished reading {filename}, {text_count} texts and {token_count} tokens\n") + return retval + +if __name__ == '__main__': + parse_vrt_in_dir(sys.argv[1]) diff --git a/_hands-on/parallelise/parse_vrt_solution.py b/_hands-on/parallelise/parse_vrt_solution.py new file mode 100644 index 00000000..d4c0bbf6 --- /dev/null +++ b/_hands-on/parallelise/parse_vrt_solution.py @@ -0,0 +1,71 @@ +import os +import sys +import time +from lxml import etree + +# process-based parallelism +from multiprocessing import Pool + +# a list of common semantically useless lemmas +stopwords = set([line.strip() for line in open('stopwords.txt')]) + +def is_content_word(lemma): + return lemma.isalpha() and lemma not in stopwords + +def parse_vrt_in_dir(dirname): + ''' + Parse each file ending in .vrt in dirname in parallel, and return their concatenation. + ''' + start_time = time.time() + sys.stderr.write(f"Running parse_vrt_in_dir...\n"); + + # Exercise 1 solution (one possible one): we map each filename to a + # vrt2lemmalists call using multiprocessing.Pool + retval = [] + # First we get the valid file names + filenames = [os.path.join(dirname, filename) for filename in os.listdir(dirname) if filename.endswith('.vrt')] + # Then we initialize a Pool object + with Pool() as pool: # by default, processes = number of cores + for result in pool.map(vrt2lemmalists, filenames): + retval += result + # How long did we take? + sys.stderr.write( + f"...finished in {time.time() - start_time:.2f} s\n") + return retval + +def vrt2lemmalists(filename, max_texts = None, lemma_col = 3): + ''' + Parse each text in a VRT file into a list of lemmas, and return a list of those lists. 
+ ''' + + sys.stderr.write(f"Reading {filename}\n") + retval = [] + fobj = open(filename, "rb") + parser = etree.XMLParser(recover = True) + + text_count = 0 + token_count = 0 + for line in fobj: + if max_texts and text_count >= max_texts: + break + parser.feed(line) + if line.strip() != b'': + continue + # A text has ended + text_count += 1 + text = parser.close() + this_text = [] + for leaf in text.iter(): + tokens = leaf.text.strip() + if tokens != "": + for token in tokens.split('\n'): + token_count += 1 + lemma = token.split('\t')[lemma_col-1] + if is_content_word(lemma): + this_text.append(lemma) + retval.append(this_text) + sys.stderr.write(f"Finished reading {filename}, {text_count} texts and {token_count} tokens\n") + return retval + +if __name__ == '__main__': + parse_vrt_in_dir(sys.argv[1]) diff --git a/_hands-on/parallelise/requirements.txt b/_hands-on/parallelise/requirements.txt new file mode 100644 index 00000000..bfa22dd4 --- /dev/null +++ b/_hands-on/parallelise/requirements.txt @@ -0,0 +1,2 @@ +lxml +gensim diff --git a/_hands-on/parallelise/stopwords.txt b/_hands-on/parallelise/stopwords.txt new file mode 100644 index 00000000..6301bfd4 --- /dev/null +++ b/_hands-on/parallelise/stopwords.txt @@ -0,0 +1,50 @@ +ei +että +hän +ja +jo +joka +jos +kuin +kun +mikä +mutta +muu +myös +niin +olla +oma +pitää +saada +se +tehdä +tulla +tuo +tämä +voida +vuosi +mukaan +aika +uusi +kertoa +sanoa +nyt +kaikki +tai +kaksi +toinen +viime +hyvä +jälkeen +vielä +yksi +asia +kanssa +yli +osa +ensimmäinen +sekä +alkaa +vain +sitten +mennä diff --git a/_hands-on/parallelise/topics.py b/_hands-on/parallelise/topics.py new file mode 100644 index 00000000..e4e8e359 --- /dev/null +++ b/_hands-on/parallelise/topics.py @@ -0,0 +1,58 @@ +# Comments beginning with "Exercise" mark places to edit the code + +import gensim +import os +import sys +import time +from parse_vrt import parse_vrt_in_dir + +n_topics = 10 + +processed_corpus = [] +dirname = sys.argv[1] + +start_time = time.time() +sys.stderr.write(f"Running parse_vrt_in_dir...\n"); +corpus_lemmalists = parse_vrt_in_dir(dirname) +sys.stderr.write( + f"...finished in {time.time() - start_time:.2f} s\n") + +sys.stderr.write("Building gensim dictionary... "); sys.stderr.flush() +start_time = time.time() + +# Exercise 4: Parallelise building the dictionary +# Hint: the dictionary has a merge_with(other) method +dictionary = gensim.corpora.Dictionary(corpus_lemmalists) +sys.stderr.write(f"Done in {time.time() - start_time:.2f} s\n") +sys.stderr.write("Computing BOW corpus... "); sys.stderr.flush() +start_time = time.time() + +# Exercise 3: Parallelise computing bow_corpus +# Hint: send the corpus in suitable-sized chunks to processes that map +# the corpus with the function dictionary.doc2bow +bow_corpus = [dictionary.doc2bow(text) for text in corpus_lemmalists] +sys.stderr.write(f"Done in {time.time() - start_time:.2f} s\n") +sys.stderr.write("Computing LDA model... "); sys.stderr.flush() +start_time = time.time() + +# Exercise 2: replace LdaModel with a parallel version +# Hint: you can simply replace the model name, but do look at the API, +# choose a number of processes, and test which one works best. Warning: +# memory consumption will grow with number of processes, it's possible to run +# out if you have a lot of cores! +lda = gensim.models.LdaModel(bow_corpus, num_topics = n_topics) +sys.stderr.write(f"Done in {time.time() - start_time:.2f} s\n") + +sys.stderr.write("Computing model coherence... 
\n") +start_time = time.time() +cm = gensim.models.coherencemodel.CoherenceModel( + model=lda, corpus=bow_corpus, dictionary=dictionary, coherence='u_mass') +print(f" Coherence with {n_topics} topics was {cm.get_coherence()}") +sys.stderr.write(f"Done in {time.time() - start_time:.2f} s\n") + +for topic in enumerate(lda.show_topics(num_topics = n_topics, + num_words = 10, + formatted = False)): + print(f"Topic {topic[0] + 1}:") + for word, probability in topic[1][1]: + print(" " + dictionary[int(word)]) diff --git a/_hands-on/parallelise/topics_solution.py b/_hands-on/parallelise/topics_solution.py new file mode 100644 index 00000000..c62b9a48 --- /dev/null +++ b/_hands-on/parallelise/topics_solution.py @@ -0,0 +1,59 @@ +import gensim +import os +import sys +import time +from parse_vrt_solution import parse_vrt_in_dir +from multiprocessing import Pool + +processed_corpus = [] +dirname = sys.argv[1] +n_workers = 4 + +corpus_lemmalists = parse_vrt_in_dir(dirname) + +sys.stderr.write("Building gensim dictionary... "); sys.stderr.flush() +start_time = time.time() + +# Solution to exercise 4: Parallelise computing the gensim dictionary +# In this case, we need to build one object (the dictionary) out of a +# collection of data. The object in question has a method .merge_with(other), +# which we can use to turn a collection of objects into one. But we also +# need to split the source data, which is a list, into sublists. + +# Split a list into sublists of length n +# returns a generator, so we don't generate a whole new list with everything +def split_list(l, n): + return (l[i:i+n] for i in range(0, len(l), n)) + +dictionary = None +with Pool(processes = n_workers) as pool: + for sub_dictionary in pool.map(gensim.corpora.Dictionary, + split_list(corpus_lemmalists, 5000)): + if dictionary is None: + dictionary = sub_dictionary + else: + dictionary.merge_with(sub_dictionary) + +dictionary = gensim.corpora.Dictionary(corpus_lemmalists) +sys.stderr.write(f"Done in {time.time() - start_time:.2f} s\n") +sys.stderr.write("Computing BOW corpus... "); sys.stderr.flush() +start_time = time.time() + +# Solution to exercise 3: Parallelise computing bow_corpus +with Pool(processes = n_workers) as pool: + bow_corpus = pool.map(dictionary.doc2bow, corpus_lemmalists) +sys.stderr.write(f"Done in {time.time() - start_time:.2f} s\n") + +# Exercise 2: replace LdaModel with a parallel version +sys.stderr.write("Computing LDA model... "); sys.stderr.flush() +start_time = time.time() +# Workers should be number of physical cores, up to a limit. Good idea +# to test this if it's important. +lda = gensim.models.LdaMulticore(bow_corpus, num_topics = 10, workers = n_workers) +sys.stderr.write(f"Done in {time.time() - start_time:.2f} s\n") +for topic in enumerate(lda.show_topics(num_topics = 10, + num_words = 10, + formatted = False)): + print(f"Topic {topic[0] + 1}:") + for word, probability in topic[1][1]: + print(" " + dictionary[int(word)])