Skip to content

Commit 4e04316

Browse files
committed
GH-7: Fix hang when encountering an unexpected encoding.
1 parent 9bc5588 commit 4e04316

File tree

6 files changed

+127
-36
lines changed

6 files changed

+127
-36
lines changed

Sources/Document.swift

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import Foundation
33
/**
44
Convenience model for working with CSV in-memory.
55

6-
- Note: Take advantage of data streaming capabilities when possible.
6+
- Note: Take advantage of data streaming capabilities when possible instead of using this convenience class.
77
*/
88
public class Document: InputHandlerDelegate {
99

@@ -61,6 +61,8 @@ public class Document: InputHandlerDelegate {
6161

6262
/**
6363
Initialize a document from a CSV-formatted file.
64+
65+
- Note: Although this streams input data from the `FileHandle` the resulting document is still the full physical representation of the data.
6466
*/
6567
public convenience init(fileHandle: FileHandle, dialect: Dialect = Dialect()) throws {
6668
self.init(dialect: dialect)

Sources/InputHandler.swift

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ public protocol InputHandlerDelegate: class {
3939
public class InputHandler {
4040

4141
public static let defaultByteLength = 128
42+
public static let defaultMaxRetries = 3
4243

4344
/**
4445
Delegate responsible for handling events from the input stream.
@@ -53,29 +54,33 @@ public class InputHandler {
5354
*/
5455
private(set) public var isOpen = false
5556
private var data = Data()
57+
private let maxRetries: Int
58+
private var retries: Int = 0
5659
private let fileHandle: FileHandle
5760
private var parser: ImportParser
5861

5962
/**
6063
- Parameter fileHandle: FileHandle for reading. InputHandler should be solely responsible for controlling seeking behavior during its lifetime. The FileHandle's seek position should be at the beginning.
6164
- Parameter dialect: Dialect from which to parse against.
65+
- Parameter maxRetries: Maximum number of allowed consecutive retries
6266
*/
63-
public init(fileHandle: FileHandle, dialect: Dialect = Dialect()) {
67+
public init(fileHandle: FileHandle, dialect: Dialect = Dialect(), maxRetries: Int = InputHandler.defaultMaxRetries) {
6468
self.fileHandle = fileHandle
6569
self.dialect = dialect
70+
self.maxRetries = maxRetries
6671
self.parser = ImportParser(dialect: dialect)
6772
}
6873

69-
public convenience init(from url: URL, dialect: Dialect = Dialect()) throws {
74+
public convenience init(from url: URL, dialect: Dialect = Dialect(), maxRetries: Int = InputHandler.defaultMaxRetries) throws {
7075
let fileHandle = try FileHandle(forReadingFrom: url)
71-
self.init(fileHandle: fileHandle, dialect: dialect)
76+
self.init(fileHandle: fileHandle, dialect: dialect, maxRetries: maxRetries)
7277
}
7378

74-
public convenience init?(atPath path: String, dialect: Dialect = Dialect()) {
79+
public convenience init?(atPath path: String, dialect: Dialect = Dialect(), maxRetries: Int = InputHandler.defaultMaxRetries) {
7580
guard let fileHandle = FileHandle(forReadingAtPath: path) else {
7681
return nil
7782
}
78-
self.init(fileHandle: fileHandle, dialect: dialect)
83+
self.init(fileHandle: fileHandle, dialect: dialect, maxRetries: maxRetries)
7984
}
8085

8186
deinit {
@@ -85,8 +90,8 @@ public class InputHandler {
8590
/**
8691
Reads and parses a chunk of data from the FileHandle and directs output to its delegate.
8792

88-
- Parameter length: Maximum byte length of data to read.
89-
- Return: Whether data was read. False indicates end of file.
93+
- Parameter length: Maximum number of bytes of data to read.
94+
- Returns: Whether data was read and to continue reading. Data may have been buffered. False indicates end of file.
9095
- Throws: May throw anything the `InputHandlerDelegate` or `ImportParser` throws. If thrown, state should be assumed to be invalid and should be reset before attempting to read again.
9196
*/
9297
public func read(length: Int = InputHandler.defaultByteLength) throws -> Bool {
@@ -100,7 +105,13 @@ public class InputHandler {
100105
do {
101106
rows = try self.parser.import(data: data)
102107
} catch ImportParser.ImportError.badEncoding {
103-
return true // We may have broken utf8 in receiving incomplete data
108+
self.retries += 1
109+
// We may have received incomplete data that broke UTF-8 decoding due to variable byte widths
110+
// We give the reader an opportunity to read more before throwing that error
111+
guard self.retries <= self.maxRetries else {
112+
throw ImportParser.ImportError.badEncoding
113+
}
114+
return true
104115
}
105116
var dropHeader = false
106117

@@ -118,14 +129,15 @@ public class InputHandler {
118129
try self.delegate?.append(records: dropHeader ? Array(rows.dropFirst()) : rows)
119130

120131
self.data = Data()
132+
self.retries = 0
121133

122134
return true
123135
}
124136

125137
/**
126138
Convenience for reading the FileHandle and handling parsing through to the end of file.
127139

128-
- Parameter length: Maximum byte length of data to read at each iteration.
140+
- Parameter length: Maximum number of bytes of data to read at each iteration.
129141
*/
130142
public func readToEndOfFile(length: Int = InputHandler.defaultByteLength) throws {
131143
while try self.read(length: length) {}
@@ -151,6 +163,7 @@ public class InputHandler {
151163
self.parser = ImportParser(dialect: dialect)
152164
self.isOpen = false
153165
self.data = Data()
166+
self.retries = 0
154167
self.delegate?.reset()
155168
return true
156169
}
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
éab,abé,aéb,abcé
2+
123,456,789,321
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
"quote","author"
2+
"Always bear in mind that your own resol�tion to succeed is more important than any other.","Abraham Lincoln"

Tests/DialectalCSVTests/ImportTests.swift

Lines changed: 71 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ class ImportTests: XCTestCase {
2121
("testTrailingComma", testTrailingComma),
2222
("testUnescapedQuotes", testUnescapedQuotes),
2323
("testUnquotedHeaders", testUnquotedHeaders),
24+
("testBadEncoding", testBadEncoding),
25+
("testVariableWidthEncodedStreamSplit", testVariableWidthEncodedStreamSplit)
2426
]
2527

2628
func testEscapeCharacter() {
@@ -105,42 +107,26 @@ class ImportTests: XCTestCase {
105107
XCTAssertEqual(document.records.count, 1)
106108
}
107109

108-
func testLineTerminatorStreamSplit() {
110+
func testLineTerminatorStreamSplit() throws {
109111
let inputURL = Utility.fixtureURL(named: "lineTerminatorStreamSplit.csv")
110112
var dialect = Dialect()
111113
dialect.header = false
112-
let inputFileHandle = try! FileHandle(forReadingFrom: inputURL)
114+
let inputFileHandle = try FileHandle(forReadingFrom: inputURL)
113115
let inputHandler = InputHandler(fileHandle: inputFileHandle, dialect: dialect)
114116

115-
class Handler: InputHandlerDelegate {
116-
var records = [Record]()
117-
118-
public func open(header: Header? = nil) throws {}
119-
120-
public func append(records: [Record]) throws {
121-
self.records.append(contentsOf: records)
122-
}
123-
124-
public func close() throws {}
125-
126-
public func reset() {}
127-
}
128-
let handler = Handler()
117+
let handler = SpyInputHandlerDelegate()
129118
inputHandler.delegate = handler
130119

131-
do {
132-
try inputHandler.readToEndOfFile(length: 8)
133-
} catch {
134-
XCTFail()
135-
}
136-
120+
try inputHandler.readToEndOfFile(length: 8)
137121
XCTAssertEqual(handler.records.count, 2)
138122

139-
XCTAssertEqual(handler.records[0][0], "abc")
140-
XCTAssertEqual(handler.records[0][1], "xyz")
123+
let first = handler.records[0]
124+
XCTAssertEqual(first[0], "abc")
125+
XCTAssertEqual(first[1], "xyz")
141126

142-
XCTAssertEqual(handler.records[1][0], "123")
143-
XCTAssertEqual(handler.records[1][1], "456")
127+
let second = handler.records[1]
128+
XCTAssertEqual(second[0], "123")
129+
XCTAssertEqual(second[1], "456")
144130
}
145131

146132
func testMissingEndOfLine() {
@@ -304,4 +290,63 @@ class ImportTests: XCTestCase {
304290
XCTAssertEqual(document.header![1], HeaderFields.author.rawValue + " name")
305291
}
306292

293+
func testBadEncoding() throws {
294+
let fileURL = Utility.fixtureURL(named: "western1252Encoded.csv")
295+
let fileHandle = try FileHandle(forReadingFrom: fileURL)
296+
do {
297+
_ = try Document(fileHandle: fileHandle)
298+
} catch ImportParser.ImportError.badEncoding {
299+
return
300+
} catch {
301+
XCTFail()
302+
return
303+
}
304+
XCTFail()
305+
}
306+
307+
func testVariableWidthEncodedStreamSplit() throws {
308+
let inputURL = Utility.fixtureURL(named: "variableWidthEncodedStreamSplit.csv")
309+
var dialect = Dialect()
310+
dialect.header = false
311+
312+
let inputFileHandle = try FileHandle(forReadingFrom: inputURL)
313+
var inputHandler = InputHandler(fileHandle: inputFileHandle, dialect: dialect)
314+
var handler = SpyInputHandlerDelegate()
315+
inputHandler.delegate = handler
316+
317+
for numberOfBytes in 1...4 {
318+
try inputHandler.readToEndOfFile(length: numberOfBytes)
319+
XCTAssertEqual(handler.records.count, 2)
320+
321+
let first = try XCTUnwrap(handler.records[safe: 0])
322+
XCTAssertEqual(first.count, 4)
323+
XCTAssertEqual(first[0], "éab")
324+
XCTAssertEqual(first[1], "abé")
325+
XCTAssertEqual(first[2], "aéb")
326+
XCTAssertEqual(first[3], "abcé")
327+
328+
let second = try XCTUnwrap(handler.records[safe: 1])
329+
XCTAssertEqual(second.count, 4)
330+
XCTAssertEqual(second[0], "123")
331+
XCTAssertEqual(second[1], "456")
332+
XCTAssertEqual(second[2], "789")
333+
XCTAssertEqual(second[3], "321")
334+
}
335+
336+
inputHandler = InputHandler(fileHandle: inputFileHandle, dialect: dialect, maxRetries: 0)
337+
handler = SpyInputHandlerDelegate()
338+
inputHandler.delegate = handler
339+
340+
for numberOfBytes in 1...4 {
341+
do {
342+
try inputHandler.readToEndOfFile(length: numberOfBytes)
343+
} catch ImportParser.ImportError.badEncoding {
344+
return
345+
} catch {
346+
XCTFail()
347+
return
348+
}
349+
}
350+
}
351+
307352
}

Tests/DialectalCSVTests/Utility.swift

100644100755
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,3 +46,30 @@ class Utility {
4646
}
4747

4848
}
49+
50+
extension Array {
51+
52+
subscript(safe index: Index) -> Element? {
53+
guard index < endIndex, index >= startIndex else { return nil }
54+
return self[index]
55+
}
56+
57+
}
58+
59+
import DialectalCSV
60+
61+
class SpyInputHandlerDelegate: InputHandlerDelegate {
62+
63+
private(set) var records = [Record]()
64+
65+
public func open(header: Header? = nil) throws {}
66+
67+
public func append(records: [Record]) throws {
68+
self.records.append(contentsOf: records)
69+
}
70+
71+
public func close() throws {}
72+
73+
public func reset() {}
74+
75+
}

0 commit comments

Comments
 (0)