Skip to content

Commit 27a7cd3

Browse files
committed
Changes dependency of HTML Importer from Mechanize to just Nokogiri
1 parent fd08213 commit 27a7cd3

File tree

5 files changed

+40
-33
lines changed

5 files changed

+40
-33
lines changed

Gemfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@ group :optional do
77
gem 'dbd-sqlite3'
88
gem 'dbi'
99
gem 'jsonpath'
10-
gem 'mechanize'
1110
gem 'mongo'
11+
gem 'nokogiri'
1212
gem 'redis'
1313
gem 'roo', '~> 2.7.0'
1414
gem 'rsruby'

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ Imports a **Daru::DataFrame** from a **.xlsx** file.
192192
Imports an **Array** of **Daru::DataFrame**s from a **.html** file or website.
193193

194194
- **Docs**: [rubydoc.info](http://www.rubydoc.info/github/athityakumar/daru-io/master/Daru/IO/Importers/HTML)
195-
- **Gem Dependencies**: `mechanize` gem
195+
- **Gem Dependencies**: `nokogiri` gem
196196
- **Usage**:
197197
```ruby
198198
#! Partially require just HTML Importer

lib/daru/io/importers/html.rb

Lines changed: 22 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@ class HTML < Base
1414

1515
# Checks for required gem dependencies of HTML Importer
1616
def initialize
17-
optional_gem 'mechanize'
17+
require 'open-uri'
18+
optional_gem 'nokogiri'
1819
end
1920

2021
# Reads from an HTML file / website
@@ -29,7 +30,7 @@ def initialize
2930
# @example Reading from a website URL
3031
# instance = Daru::IO::Importers::HTML.read('http://www.moneycontrol.com/')
3132
def read(path)
32-
@file_data = Mechanize.new.get(path)
33+
@file_data = Nokogiri.parse(open(path).read)
3334
self
3435
end
3536

@@ -72,25 +73,23 @@ def read(path)
7273
# # 3 ITC 315.85 6.75 621.12
7374
# # 4 HDFC 1598.85 50.95 553.91
7475
def call(match: nil, order: nil, index: nil, name: nil)
75-
@match = match
76-
@options = {name: name, order: order, index: index}
76+
@match = match
77+
@options = {name: name, index: index, order: order}
7778

7879
@file_data
79-
.search('table').map { |table| parse_table table }
80-
.keep_if { |table| search table }
80+
.search('table')
81+
.map { |table| parse_table(table) }
8182
.compact
82-
.map { |table| decide_values table, @options }
83-
.map { |table| table_to_dataframe table }
83+
.keep_if { |table| satisfy_dimension(table) && search(table) }
84+
.map { |table| decide_values(table, @options) }
85+
.map { |table| table_to_dataframe(table) }
8486
end
8587

8688
private
8789

8890
# Allows user to override the scraped order / index / data
89-
def decide_values(scraped_val={}, user_val={})
90-
%I[data index name order].each do |key|
91-
user_val[key] ||= scraped_val[key]
92-
end
93-
user_val
91+
def decide_values(scraped_val, user_val)
92+
scraped_val.merge(user_val) { |_key, scraped, user| user || scraped }
9493
end
9594

9695
# Splits headers (all th tags) into order and index. Wherein,
@@ -121,15 +120,23 @@ def scrape_tag(table, tag)
121120
[arr, size]
122121
end
123122

123+
def satisfy_dimension(table)
124+
return false if @options[:order] && table[:data].first.size != @options[:order].size
125+
return false if @options[:index] && table[:data].size != @options[:index].size
126+
true
127+
end
128+
124129
def search(table)
125-
@match.nil? ? true : (table.to_s.include? @match)
130+
@match.nil? ? true : table.to_s.include?(@match)
126131
end
127132

128133
def table_to_dataframe(table)
129-
Daru::DataFrame.rows table[:data],
134+
Daru::DataFrame.rows(
135+
table[:data],
130136
index: table[:index],
131137
order: table[:order],
132138
name: table[:name]
139+
)
133140
end
134141
end
135142
end

spec/daru/io/importers/html_spec.rb

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,10 @@
55
let(:df_index) { 0 }
66

77
context 'in wiki info table' do
8-
let(:path) { "file://#{Dir.pwd}/spec/fixtures/html/wiki_table_info.html" }
9-
let(:order) { %w[FName LName Age] }
10-
let(:index) { %w[One Two Three Four Five Six Seven] }
11-
let(:name) { 'Wikipedia Information Table' }
8+
let(:path) { 'spec/fixtures/html/wiki_table_info.html' }
9+
let(:order) { %w[FName LName Age] }
10+
let(:index) { %w[One Two Three Four Five Six Seven] }
11+
let(:name) { 'Wikipedia Information Table' }
1212

1313
context 'returns default dataframe' do
1414
it_behaves_like 'exact daru dataframe',
@@ -40,7 +40,7 @@
4040
end
4141

4242
context 'in wiki climate data' do
43-
let(:path) { "file://#{Dir.pwd}/spec/fixtures/html/wiki_climate.html" }
43+
let(:path) { 'spec/fixtures/html/wiki_climate.html' }
4444

4545
context 'returns default dataframe' do
4646
it_behaves_like 'exact daru dataframe',
@@ -58,9 +58,9 @@
5858
end
5959

6060
context 'with valid html table markups' do
61-
let(:path) { "file://#{Dir.pwd}/spec/fixtures/html/valid_markup.html" }
62-
let(:index) { %w[W X Y Z] }
63-
let(:name) { 'Small HTML table with index' }
61+
let(:path) { 'spec/fixtures/html/valid_markup.html' }
62+
let(:index) { %w[W X Y Z] }
63+
let(:name) { 'Small HTML table with index' }
6464

6565
context 'returns user-modified dataframe' do
6666
let(:opts) { {index: index, name: name} }
@@ -76,9 +76,9 @@
7676
end
7777

7878
context 'in year-wise passengers figure' do
79-
let(:path) { "file://#{Dir.pwd}/spec/fixtures/html/macau.html" }
80-
let(:match) { '2001' }
81-
let(:name) { 'Year-wise Passengers Figure' }
79+
let(:path) { 'spec/fixtures/html/macau.html' }
80+
let(:match) { '2001' }
81+
let(:name) { 'Year-wise Passengers Figure' }
8282

8383
context 'returns matching dataframes with index' do
8484
let(:opts) { {match: match, name: name} }
@@ -108,9 +108,9 @@
108108
end
109109

110110
context 'in share market data' do
111-
let(:path) { "file://#{Dir.pwd}/spec/fixtures/html/moneycontrol.html" }
112-
let(:match) { 'Sun Pharma' }
113-
let(:index) { %w[Alpha Beta Gamma Delta Misc] }
111+
let(:path) { 'spec/fixtures/html/moneycontrol.html' }
112+
let(:match) { 'Sun Pharma' }
113+
let(:index) { %w[Alpha Beta Gamma Delta Misc] }
114114
let(:name) { 'Share Market Analysis' }
115115

116116
context 'returns matching dataframes' do
@@ -149,7 +149,7 @@
149149
end
150150

151151
context 'in election results data' do
152-
let(:path) { "file://#{Dir.pwd}/spec/fixtures/html/eciresults.html" }
152+
let(:path) { 'spec/fixtures/html/eciresults.html' }
153153

154154
context 'returns default dataframes' do
155155
it_behaves_like 'exact daru dataframe',

spec/spec_helper.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
require 'redis'
2020
require 'dbi'
2121
require 'jsonpath'
22-
require 'mechanize'
22+
require 'nokogiri'
2323
require 'mongo'
2424
require 'spreadsheet'
2525
require 'sqlite3'

0 commit comments

Comments
 (0)