Skip to content

Commit 3c4a734

Browse files
committed
Scraping
1 parent 665f486 commit 3c4a734

File tree

2 files changed

+839
-731
lines changed

2 files changed

+839
-731
lines changed

Web Sraping/Dawn Latest news Scraping.ipynb

Lines changed: 761 additions & 731 deletions
Large diffs are not rendered by default.

Web Sraping/Photos.ipynb

Lines changed: 78 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,78 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"id": "initial_id",
7+
"metadata": {
8+
"collapsed": true
9+
},
10+
"outputs": [],
11+
"source": [
# Download every article thumbnail into images/<topic>/<sanitised_title>.jpg.
#
# NOTE(review): `df` is not defined anywhere in this notebook. The code below
# reads its 'Topic', 'Title' and 'Thumbnail' columns, so a DataFrame with those
# columns must already exist in the kernel -- TODO confirm where it is loaded.
import datetime
import os
import re
import urllib.request


def _title_to_filename(title):
    """Return a file-system-safe base name derived from an article title.

    The title is lower-cased, spaces become underscores, and every character
    that is not a word character, '_' or '-' is dropped. This covers the
    punctuation the cell used to strip one character at a time, plus path
    separators and other characters that are invalid in file names.
    """
    name = str(title).lower().replace(' ', '_')
    return re.sub(r'[^\w-]', '', name)


# STEP 1: root folder that receives one sub-folder per topic
BASE_DIR = 'images/'

# STEP 2: one sub-folder per distinct topic.
# Rows without a topic are skipped; they cannot be mapped to a folder.
TOPICS = list(df['Topic'].dropna().unique())
SUB_DIRS = [topic + '/' for topic in TOPICS]

# Print a message to the user
print('Image Download Started...')
start_time = datetime.datetime.now()

# STEP 3: create the folders; exist_ok makes the cell safe to re-run
for sub_dir in SUB_DIRS:
    os.makedirs(BASE_DIR + sub_dir, exist_ok=True)

# STEP 4: walk the rows topic by topic.
# The filter uses the topic value itself (not a string split of the folder
# name), so topics that contain '/' still match their rows.
failed = []
for topic, sub_dir in zip(TOPICS, SUB_DIRS):
    for ind, row in df[df['Topic'] == topic].iterrows():
        url = row['Thumbnail']

        # Fall back to the row index when the title sanitises to nothing.
        base_name = _title_to_filename(row['Title']) or str(ind)
        target = '{}{}.jpg'.format(BASE_DIR + sub_dir, base_name)

        if not isinstance(url, str) or not url:
            # No usable thumbnail URL for this row.
            failed.append((ind, 'missing thumbnail URL'))
            continue

        # STEP 5: fetch the image; one bad URL must not abort the whole batch.
        try:
            urllib.request.urlretrieve(url, target)
        except (OSError, ValueError) as exc:
            # URLError/HTTPError are OSError subclasses; ValueError is raised
            # for malformed URLs such as an unknown URL type.
            failed.append((ind, exc))
            print('\tCould not download {!r}: {}'.format(url, exc))

# Tell the user that download has finished
end_time = datetime.datetime.now()
print('\tDownload Finished! It took {} seconds.'.format(int((end_time - start_time).total_seconds())))
if failed:
    print('\t{} image(s) could not be downloaded.'.format(len(failed)))
54+
]
55+
}
56+
],
57+
"metadata": {
58+
"kernelspec": {
59+
"display_name": "Python 3",
60+
"language": "python",
61+
"name": "python3"
62+
},
63+
"language_info": {
64+
"codemirror_mode": {
65+
"name": "ipython",
66+
"version": 2
67+
},
68+
"file_extension": ".py",
69+
"mimetype": "text/x-python",
70+
"name": "python",
71+
"nbconvert_exporter": "python",
72+
"pygments_lexer": "ipython2",
73+
"version": "2.7.6"
74+
}
75+
},
76+
"nbformat": 4,
77+
"nbformat_minor": 5
78+
}

0 commit comments

Comments
 (0)