Skip to content

Commit 89c6c11

Browse files
committed
GH-7: Optionally specify import data encoding.
1 parent 4e04316 commit 89c6c11

File tree

4 files changed

+61
-44
lines changed

4 files changed

+61
-44
lines changed

Sources/Document.swift

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -50,9 +50,9 @@ public class Document: InputHandlerDelegate {
5050
- Parameter data: Data comprising the entire document, encoded as text using `encoding` (UTF-8 by default).
5151
- Parameter dialect: Dialect to parse against.
5252
*/
53-
public convenience init(data: Data, dialect: Dialect = Dialect()) throws {
53+
public convenience init(data: Data, encoding: String.Encoding = .utf8, dialect: Dialect = Dialect()) throws {
5454
let parser = ImportParser(dialect: dialect)
55-
var allRows = try parser.import(data: data)
55+
var allRows = try parser.import(data: data, encoding: encoding)
5656
if let row = try parser.flushRow() {
5757
allRows.append(row)
5858
}
@@ -64,9 +64,9 @@ public class Document: InputHandlerDelegate {
6464

6565
- Note: Although this streams input data from the `FileHandle` the resulting document is still the full physical representation of the data.
6666
*/
67-
public convenience init(fileHandle: FileHandle, dialect: Dialect = Dialect()) throws {
67+
public convenience init(fileHandle: FileHandle, encoding: String.Encoding = .utf8, dialect: Dialect = Dialect()) throws {
6868
self.init(dialect: dialect)
69-
let inputHandler = InputHandler(fileHandle: fileHandle, dialect: dialect)
69+
let inputHandler = InputHandler(fileHandle: fileHandle, encoding: encoding, dialect: dialect)
7070
inputHandler.delegate = self
7171
try inputHandler.readToEndOfFile()
7272
}

Sources/ImportParser.swift

File mode changed: 100644 → 100755
Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ public class ImportParser {
2525
}
2626

2727
/**
28-
- badEncoding: Indicates input could not be decoded.
28+
- badEncoding: Indicates input could not be decoded from the specified encoding.
2929
- uncaughtCharacter: An unexpected character at a 1-indexed row number.
3030
- uneven: Encountered a row whose number of values is mismatched relative to other rows. All rows are expected to contain the same number of values.
3131
*/
@@ -50,8 +50,8 @@ public class ImportParser {
5050
- Returns: Parsed rows. An incomplete row is withheld until the rest of its data is provided or a flush command is issued.
5151
- Note: It is best practice to call the flush method after having parsed the last of the input data.
5252
*/
53-
public func `import`(data: Data) throws -> [Row] {
54-
guard let string = String(data: data, encoding: String.Encoding.utf8) else {
53+
public func `import`(data: Data, encoding: String.Encoding = .utf8) throws -> [Row] {
54+
guard let string = String(data: data, encoding: encoding) else {
5555
throw ImportError.badEncoding
5656
}
5757

Sources/InputHandler.swift

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -57,30 +57,32 @@ public class InputHandler {
5757
private let maxRetries: Int
5858
private var retries: Int = 0
5959
private let fileHandle: FileHandle
60+
private let encoding: String.Encoding
6061
private var parser: ImportParser
6162

6263
/**
6364
- Parameter fileHandle: FileHandle for reading. InputHandler should be solely responsible for controlling seeking behavior during its lifetime. The FileHandle's seek position should be at the beginning.
6465
- Parameter dialect: Dialect to parse against.
6566
- Parameter maxRetries: Maximum number of consecutive retries allowed after a decoding failure.
6667
*/
67-
public init(fileHandle: FileHandle, dialect: Dialect = Dialect(), maxRetries: Int = InputHandler.defaultMaxRetries) {
68+
public init(fileHandle: FileHandle, encoding: String.Encoding = .utf8, dialect: Dialect = Dialect(), maxRetries: Int = InputHandler.defaultMaxRetries) {
6869
self.fileHandle = fileHandle
70+
self.encoding = encoding
6971
self.dialect = dialect
7072
self.maxRetries = maxRetries
7173
self.parser = ImportParser(dialect: dialect)
7274
}
7375

74-
public convenience init(from url: URL, dialect: Dialect = Dialect(), maxRetries: Int = InputHandler.defaultMaxRetries) throws {
76+
public convenience init(from url: URL, encoding: String.Encoding = .utf8, dialect: Dialect = Dialect(), maxRetries: Int = InputHandler.defaultMaxRetries) throws {
7577
let fileHandle = try FileHandle(forReadingFrom: url)
76-
self.init(fileHandle: fileHandle, dialect: dialect, maxRetries: maxRetries)
78+
self.init(fileHandle: fileHandle, encoding: encoding, dialect: dialect, maxRetries: maxRetries)
7779
}
7880

79-
public convenience init?(atPath path: String, dialect: Dialect = Dialect(), maxRetries: Int = InputHandler.defaultMaxRetries) {
81+
public convenience init?(atPath path: String, encoding: String.Encoding = .utf8, dialect: Dialect = Dialect(), maxRetries: Int = InputHandler.defaultMaxRetries) {
8082
guard let fileHandle = FileHandle(forReadingAtPath: path) else {
8183
return nil
8284
}
83-
self.init(fileHandle: fileHandle, dialect: dialect, maxRetries: maxRetries)
85+
self.init(fileHandle: fileHandle, encoding: encoding, dialect: dialect, maxRetries: maxRetries)
8486
}
8587

8688
deinit {
@@ -103,7 +105,7 @@ public class InputHandler {
103105

104106
var rows = [Row]()
105107
do {
106-
rows = try self.parser.import(data: data)
108+
rows = try self.parser.import(data: data, encoding: encoding)
107109
} catch ImportParser.ImportError.badEncoding {
108110
self.retries += 1
109111
// We may have received incomplete data that broke decoding because a variable-width character (e.g. in UTF-8) was split across reads

Tests/DialectalCSVTests/ImportTests.swift

Lines changed: 46 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import XCTest
44
class ImportTests: XCTestCase {
55

66
static var allTests = [
7+
("testBadEncoding", testBadEncoding),
78
("testEscapeCharacter", testEscapeCharacter),
89
("testEscapeDoubleQuote", testEscapeDoubleQuote),
910
("testHeadersOnly", testHeadersOnly),
@@ -21,10 +22,24 @@ class ImportTests: XCTestCase {
2122
("testTrailingComma", testTrailingComma),
2223
("testUnescapedQuotes", testUnescapedQuotes),
2324
("testUnquotedHeaders", testUnquotedHeaders),
24-
("testBadEncoding", testBadEncoding),
25-
("testVariableWidthEncodedStreamSplit", testVariableWidthEncodedStreamSplit)
25+
("testVariableWidthEncodedStreamSplit", testVariableWidthEncodedStreamSplit),
26+
("testWestern1252Encoding", testWestern1252Encoding)
2627
]
2728

29+
func testBadEncoding() throws {
30+
let fileURL = Utility.fixtureURL(named: "western1252Encoded.csv")
31+
let fileHandle = try FileHandle(forReadingFrom: fileURL)
32+
do {
33+
_ = try Document(fileHandle: fileHandle)
34+
} catch ImportParser.ImportError.badEncoding {
35+
return
36+
} catch {
37+
XCTFail()
38+
return
39+
}
40+
XCTFail()
41+
}
42+
2843
func testEscapeCharacter() {
2944
let data = Utility.fixture(named: "escapeCharacter.csv")
3045
var dialect = Dialect()
@@ -290,52 +305,38 @@ class ImportTests: XCTestCase {
290305
XCTAssertEqual(document.header![1], HeaderFields.author.rawValue + " name")
291306
}
292307

293-
func testBadEncoding() throws {
294-
let fileURL = Utility.fixtureURL(named: "western1252Encoded.csv")
295-
let fileHandle = try FileHandle(forReadingFrom: fileURL)
296-
do {
297-
_ = try Document(fileHandle: fileHandle)
298-
} catch ImportParser.ImportError.badEncoding {
299-
return
300-
} catch {
301-
XCTFail()
302-
return
303-
}
304-
XCTFail()
305-
}
306-
307308
func testVariableWidthEncodedStreamSplit() throws {
308309
let inputURL = Utility.fixtureURL(named: "variableWidthEncodedStreamSplit.csv")
309310
var dialect = Dialect()
310311
dialect.header = false
311312

312313
let inputFileHandle = try FileHandle(forReadingFrom: inputURL)
313314
var inputHandler = InputHandler(fileHandle: inputFileHandle, dialect: dialect)
314-
var handler = SpyInputHandlerDelegate()
315-
inputHandler.delegate = handler
315+
var outputSpy = SpyInputHandlerDelegate()
316+
inputHandler.delegate = outputSpy
316317

317318
for numberOfBytes in 1...4 {
318319
try inputHandler.readToEndOfFile(length: numberOfBytes)
319-
XCTAssertEqual(handler.records.count, 2)
320+
XCTAssertEqual(outputSpy.records.count, 2)
320321

321-
let first = try XCTUnwrap(handler.records[safe: 0])
322+
let first = try XCTUnwrap(outputSpy.records[safe: 0])
322323
XCTAssertEqual(first.count, 4)
323-
XCTAssertEqual(first[0], "éab")
324-
XCTAssertEqual(first[1], "abé")
325-
XCTAssertEqual(first[2], "aéb")
326-
XCTAssertEqual(first[3], "abcé")
324+
XCTAssertEqual(first[safe: 0], "éab")
325+
XCTAssertEqual(first[safe: 1], "abé")
326+
XCTAssertEqual(first[safe: 2], "aéb")
327+
XCTAssertEqual(first[safe: 3], "abcé")
327328

328-
let second = try XCTUnwrap(handler.records[safe: 1])
329+
let second = try XCTUnwrap(outputSpy.records[safe: 1])
329330
XCTAssertEqual(second.count, 4)
330-
XCTAssertEqual(second[0], "123")
331-
XCTAssertEqual(second[1], "456")
332-
XCTAssertEqual(second[2], "789")
333-
XCTAssertEqual(second[3], "321")
331+
XCTAssertEqual(second[safe: 0], "123")
332+
XCTAssertEqual(second[safe: 1], "456")
333+
XCTAssertEqual(second[safe: 2], "789")
334+
XCTAssertEqual(second[safe: 3], "321")
334335
}
335336

336337
inputHandler = InputHandler(fileHandle: inputFileHandle, dialect: dialect, maxRetries: 0)
337-
handler = SpyInputHandlerDelegate()
338-
inputHandler.delegate = handler
338+
outputSpy = SpyInputHandlerDelegate()
339+
inputHandler.delegate = outputSpy
339340

340341
for numberOfBytes in 1...4 {
341342
do {
@@ -349,4 +350,18 @@ class ImportTests: XCTestCase {
349350
}
350351
}
351352

353+
func testWestern1252Encoding() throws {
354+
let inputURL = Utility.fixtureURL(named: "western1252Encoded.csv")
355+
let inputFileHandle = try FileHandle(forReadingFrom: inputURL)
356+
let inputHandler = InputHandler(fileHandle: inputFileHandle, encoding: .windowsCP1252)
357+
let outputSpy = SpyInputHandlerDelegate()
358+
inputHandler.delegate = outputSpy
359+
360+
try inputHandler.readToEndOfFile()
361+
XCTAssertEqual(outputSpy.records.count, 1)
362+
let first = try XCTUnwrap(outputSpy.records.first)
363+
XCTAssertEqual(first[safe: 0], "Always bear in mind that your own resolütion to succeed is more important than any other.")
364+
XCTAssertEqual(first[safe: 1], "Abraham Lincoln")
365+
}
366+
352367
}

0 commit comments

Comments
 (0)