Commit ee7ae7d

feat: update crawling exercises to be about JS
1 parent 3f87295 commit ee7ae7d

2 files changed (+71, −59 lines)

sources/academy/webscraping/scraping_basics_javascript2/10_crawling.md

Lines changed: 68 additions & 56 deletions
@@ -206,7 +206,7 @@ In the next lesson, we'll scrape the product detail pages so that each product v
 
 ### Scrape calling codes of African countries
 
-This is a follow-up to an exercise from the previous lesson, so feel free to reuse your code. Scrape links to Wikipedia pages for all African states and territories. Follow each link and extract the _calling code_ from the info table. Print the URL and the calling code for each country. Start with this URL:
+Scrape links to Wikipedia pages for all African states and territories. Follow each link and extract the _calling code_ from the info table. Print the URL and the calling code for each country. Start with this URL:
 
 ```text
 https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa
@@ -225,43 +225,53 @@ https://en.wikipedia.org/wiki/Cameroon +237
 ...
 ```
 
-Hint: Locating cells in tables is sometimes easier if you know how to [navigate up](https://cheerio.js.org/docs/api/classes/Cheerio#parent) in the HTML element tree.
+Hint: Locating cells in tables is sometimes easier if you know how to [filter](https://cheerio.js.org/docs/api/classes/Cheerio#filter) or [navigate up](https://cheerio.js.org/docs/api/classes/Cheerio#parent) in the HTML element tree.
 
 <details>
 <summary>Solution</summary>
 
-```py
-import httpx
-from bs4 import BeautifulSoup
-from urllib.parse import urljoin
-
-def download(url):
-    response = httpx.get(url)
-    response.raise_for_status()
-    return BeautifulSoup(response.text, "html.parser")
-
-def parse_calling_code(soup):
-    for label in soup.select("th.infobox-label"):
-        if label.text.strip() == "Calling code":
-            data = label.parent.select_one("td.infobox-data")
-            return data.text.strip()
-    return None
-
-listing_url = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa"
-listing_soup = download(listing_url)
-for name_cell in listing_soup.select(".wikitable tr td:nth-child(3)"):
-    link = name_cell.select_one("a")
-    country_url = urljoin(listing_url, link["href"])
-    country_soup = download(country_url)
-    calling_code = parse_calling_code(country_soup)
-    print(country_url, calling_code)
+```js
+import * as cheerio from 'cheerio';
+
+async function download(url) {
+  const response = await fetch(url);
+  if (response.ok) {
+    const html = await response.text();
+    return cheerio.load(html);
+  } else {
+    throw new Error(`HTTP ${response.status}`);
+  }
+}
+
+const listingURL = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa";
+const $ = await download(listingURL);
+
+const $promises = $(".wikitable tr td:nth-child(3)").map(async (i, element) => {
+  const $nameCell = $(element);
+  const $link = $nameCell.find("a").first();
+  const countryURL = new URL($link.attr("href"), listingURL).href;
+
+  const $c = await download(countryURL);
+  const $label = $c("th.infobox-label")
+    .filter((i, element) => $c(element).text().trim() == "Calling code")
+    .first();
+  const callingCode = $label
+    .parent()
+    .find("td.infobox-data")
+    .first()
+    .text()
+    .trim();
+
+  console.log(`${countryURL} ${callingCode || null}`);
+});
+await Promise.all($promises.get());
 ```
 
 </details>
 
 ### Scrape authors of F1 news articles
 
-This is a follow-up to an exercise from the previous lesson, so feel free to reuse your code. Scrape links to the Guardian's latest F1 news articles. For each article, follow the link and extract both the author's name and the article's title. Print the author's name and the title for all the articles. Start with this URL:
+Scrape links to the Guardian's latest F1 news articles. For each article, follow the link and extract both the author's name and the article's title. Print the author's name and the title for all the articles. Start with this URL:
 
 ```text
 https://www.theguardian.com/sport/formulaone
@@ -286,34 +296,36 @@ Hints:
 <details>
 <summary>Solution</summary>
 
-```py
-import httpx
-from bs4 import BeautifulSoup
-from urllib.parse import urljoin
-
-def download(url):
-    response = httpx.get(url)
-    response.raise_for_status()
-    return BeautifulSoup(response.text, "html.parser")
-
-def parse_author(article_soup):
-    link = article_soup.select_one('aside a[rel="author"]')
-    if link:
-        return link.text.strip()
-    address = article_soup.select_one('aside address')
-    if address:
-        return address.text.strip()
-    return None
-
-listing_url = "https://www.theguardian.com/sport/formulaone"
-listing_soup = download(listing_url)
-for item in listing_soup.select("#maincontent ul li"):
-    link = item.select_one("a")
-    article_url = urljoin(listing_url, link["href"])
-    article_soup = download(article_url)
-    title = article_soup.select_one("h1").text.strip()
-    author = parse_author(article_soup)
-    print(f"{author}: {title}")
+```js
+import * as cheerio from 'cheerio';
+
+async function download(url) {
+  const response = await fetch(url);
+  if (response.ok) {
+    const html = await response.text();
+    return cheerio.load(html);
+  } else {
+    throw new Error(`HTTP ${response.status}`);
+  }
+}
+
+const listingURL = "https://www.theguardian.com/sport/formulaone";
+const $ = await download(listingURL);
+
+const $promises = $("#maincontent ul li").map(async (i, element) => {
+  const $item = $(element);
+  const $link = $item.find("a").first();
+  const authorURL = new URL($link.attr("href"), listingURL).href;
+
+  const $a = await download(authorURL);
+  const title = $a("h1").text().trim();
+
+  const author = $a('a[rel="author"]').text().trim();
+  const address = $a('aside address').text().trim();
+
+  console.log(`${author || address || null}: ${title}`);
+});
+await Promise.all($promises.get());
 ```
 
 </details>

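Both new JS solutions rely on the same concurrency pattern: calling `.map()` on a Cheerio selection with an `async` callback yields a Cheerio collection of promises, `.get()` unwraps it into a plain array, and `Promise.all` awaits every page download at once. Below is a minimal sketch of that pattern, assuming Node 18+ (built-in `fetch`, top-level `await`) and using hypothetical example.com links rather than the lesson's pages:

```js
import * as cheerio from 'cheerio';

// Hypothetical listing page standing in for a downloaded HTML document.
const $ = cheerio.load(`
  <ul>
    <li><a href="https://example.com/a">A</a></li>
    <li><a href="https://example.com/b">B</a></li>
  </ul>
`);

// .map() with an async callback produces a collection of promises.
const $promises = $("li a").map(async (i, element) => {
  const url = $(element).attr("href");
  const response = await fetch(url); // one request per link, all started at once
  return `${url} ${response.status}`;
});

// .get() turns the Cheerio collection into a plain array for Promise.all.
const results = await Promise.all($promises.get());
console.log(results);
```

Because every callback starts immediately, the per-page requests run concurrently rather than one after another, unlike the sequential Python loops these solutions replace.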
sources/academy/webscraping/scraping_basics_python/10_crawling.md

Lines changed: 3 additions & 3 deletions
@@ -187,7 +187,7 @@ In the next lesson, we'll scrape the product detail pages so that each product v
 
 ### Scrape calling codes of African countries
 
-This is a follow-up to an exercise from the previous lesson, so feel free to reuse your code. Scrape links to Wikipedia pages for all African states and territories. Follow each link and extract the _calling code_ from the info table. Print the URL and the calling code for each country. Start with this URL:
+Scrape links to Wikipedia pages for all African states and territories. Follow each link and extract the _calling code_ from the info table. Print the URL and the calling code for each country. Start with this URL:
 
 ```text
 https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa
@@ -242,7 +242,7 @@ Hint: Locating cells in tables is sometimes easier if you know how to [navigate
 
 ### Scrape authors of F1 news articles
 
-This is a follow-up to an exercise from the previous lesson, so feel free to reuse your code. Scrape links to the Guardian's latest F1 news articles. For each article, follow the link and extract both the author's name and the article's title. Print the author's name and the title for all the articles. Start with this URL:
+Scrape links to the Guardian's latest F1 news articles. For each article, follow the link and extract both the author's name and the article's title. Print the author's name and the title for all the articles. Start with this URL:
 
 ```text
 https://www.theguardian.com/sport/formulaone
@@ -278,7 +278,7 @@ Hints:
     return BeautifulSoup(response.text, "html.parser")
 
 def parse_author(article_soup):
-    link = article_soup.select_one('aside a[rel="author"]')
+    link = article_soup.select_one('a[rel="author"]')
     if link:
         return link.text.strip()
     address = article_soup.select_one('aside address')
