Skip to content

Commit 55aaf82

Browse files
committed
Quote struct field names
1 parent 931c5f9 commit 55aaf82

File tree

7 files changed

+103
-94
lines changed

7 files changed

+103
-94
lines changed

arrow-cast/src/cast/mod.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8605,7 +8605,7 @@ mod tests {
86058605
};
86068606
assert_eq!(
86078607
t,
8608-
r#"Casting from Map(Field { "entries": Struct(key Utf8, value nullable Utf8) }, false) to Map(Field { "entries": Struct(key Utf8, value Utf8) }, true) not supported"#
8608+
r#"Casting from Map(Field { "entries": Struct("key": Utf8, "value": nullable Utf8) }, false) to Map(Field { "entries": Struct("key": Utf8, "value": Utf8) }, true) not supported"#
86098609
);
86108610
}
86118611

@@ -8656,7 +8656,7 @@ mod tests {
86568656
};
86578657
assert_eq!(
86588658
t,
8659-
r#"Casting from Map(Field { "entries": Struct(key Utf8, value nullable Interval(DayTime)) }, false) to Map(Field { "entries": Struct(key Utf8, value Duration(Second)) }, true) not supported"#
8659+
r#"Casting from Map(Field { "entries": Struct("key": Utf8, "value": nullable Interval(DayTime)) }, false) to Map(Field { "entries": Struct("key": Utf8, "value": Duration(Second)) }, true) not supported"#
86608660
);
86618661
}
86628662

@@ -10748,7 +10748,7 @@ mod tests {
1074810748
let to_type = DataType::Utf8;
1074910749
let result = cast(&struct_array, &to_type);
1075010750
assert_eq!(
10751-
r#"Cast error: Casting from Struct(a Boolean) to Utf8 not supported"#,
10751+
r#"Cast error: Casting from Struct("a": Boolean) to Utf8 not supported"#,
1075210752
result.unwrap_err().to_string()
1075310753
);
1075410754
}
@@ -10759,7 +10759,7 @@ mod tests {
1075910759
let to_type = DataType::Struct(vec![Field::new("a", DataType::Boolean, false)].into());
1076010760
let result = cast(&array, &to_type);
1076110761
assert_eq!(
10762-
r#"Cast error: Casting from Utf8 to Struct(a Boolean) not supported"#,
10762+
r#"Cast error: Casting from Utf8 to Struct("a": Boolean) not supported"#,
1076310763
result.unwrap_err().to_string()
1076410764
);
1076510765
}

arrow-json/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ use serde_json::{Number, Value};
8787
///
8888
/// This enum controls which form(s) the Reader will accept and which form the
8989
/// Writer will produce. For example, if the RecordBatch Schema is
90-
/// `[("a", Int32), ("r", Struct(b Boolean, c Utf8))]`
90+
/// `[("a", Int32), ("r", Struct("b": Boolean, "c" Utf8))]`
9191
/// then a Reader with [`StructMode::ObjectOnly`] would read rows of the form
9292
/// `{"a": 1, "r": {"b": true, "c": "cat"}}` while with ['StructMode::ListOnly']
9393
/// would read rows of the form `[1, [true, "cat"]]`. A Writer would produce

arrow-row/src/lib.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2292,7 +2292,7 @@ mod tests {
22922292
let [s2] = back.try_into().unwrap();
22932293

22942294
// RowConverter flattens Dictionary
2295-
// s.ty = Struct(foo Dictionary(Int32, Utf8)), s2.ty = Struct(foo Utf8)
2295+
// s.ty = Struct("foo": Dictionary(Int32, Utf8)), s2.ty = Struct("foo": Utf8)
22962296
assert_ne!(&s.data_type(), &s2.data_type());
22972297
s2.to_data().validate_full().unwrap();
22982298

@@ -2340,7 +2340,7 @@ mod tests {
23402340
let [s2] = back.try_into().unwrap();
23412341

23422342
// RowConverter flattens Dictionary
2343-
// s.ty = Struct(foo Dictionary(Int32, Int32)), s2.ty = Struct(foo Int32)
2343+
// s.ty = Struct("foo": Dictionary(Int32, Int32)), s2.ty = Struct("foo": Int32)
23442344
assert_ne!(&s.data_type(), &s2.data_type());
23452345
s2.to_data().validate_full().unwrap();
23462346
assert_eq!(s.len(), 0);

arrow-schema/src/datatype_format.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ impl fmt::Debug for DataType {
126126
let maybe_nullable = if field.is_nullable() { "nullable " } else { "" };
127127
let data_type = field.data_type();
128128
let metadata_str = format_metadata(field.metadata());
129-
format!("{name} {maybe_nullable}{data_type}{metadata_str}")
129+
format!("{name:?}: {maybe_nullable}{data_type}{metadata_str}")
130130
})
131131
.collect::<Vec<_>>()
132132
.join(", ");

arrow-schema/src/datatype_parse.rs

Lines changed: 92 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -81,9 +81,6 @@ impl<'a> Parser<'a> {
8181
Token::LargeList => self.parse_large_list(),
8282
Token::FixedSizeList => self.parse_fixed_size_list(),
8383
Token::Struct => self.parse_struct(),
84-
Token::FieldName(word) => {
85-
Err(make_error(self.val, &format!("unrecognized word: {word}")))
86-
}
8784
tok => Err(make_error(
8885
self.val,
8986
&format!("finding next type, got unexpected '{tok}'"),
@@ -154,15 +151,14 @@ impl<'a> Parser<'a> {
154151

155152
/// Parses the next double quoted string
156153
fn parse_double_quoted_string(&mut self, context: &str) -> ArrowResult<String> {
157-
match self.next_token()? {
158-
Token::DoubleQuotedString(s) => Ok(s),
159-
Token::FieldName(word) => {
160-
Err(make_error(self.val, &format!("unrecognized word: {word}")))
161-
}
162-
tok => Err(make_error(
154+
let token = self.next_token()?;
155+
if let Token::DoubleQuotedString(string) = token {
156+
Ok(string)
157+
} else {
158+
Err(make_error(
163159
self.val,
164-
&format!("finding double quoted string for {context}, got '{tok}'"),
165-
)),
160+
&format!("expected double quoted string for {context}, got '{token}'"),
161+
))
166162
}
167163
}
168164

@@ -324,27 +320,22 @@ impl<'a> Parser<'a> {
324320
self.expect_token(Token::LParen)?;
325321
let mut fields = Vec::new();
326322
loop {
323+
// expects: "field name": [nullable] #datatype
324+
327325
let field_name = match self.next_token()? {
328-
// It's valid to have a name that is a type name
329-
Token::SimpleType(data_type) => data_type.to_string(),
330-
Token::FieldName(name) => name,
331326
Token::RParen => {
332-
if fields.is_empty() {
333-
break;
334-
} else {
335-
return Err(make_error(
336-
self.val,
337-
"Unexpected token while parsing Struct fields. Expected a word for the name of Struct, but got trailing comma",
338-
));
339-
}
327+
break;
340328
}
329+
Token::DoubleQuotedString(field_name) => field_name,
341330
tok => {
342331
return Err(make_error(
343332
self.val,
344-
&format!("Expected a word for the name of Struct, but got {tok}"),
333+
&format!("Expected a quoted string for a field name; got {tok:?}"),
345334
))
346335
}
347336
};
337+
self.expect_token(Token::Colon)?;
338+
348339
let nullable = self
349340
.tokenizer
350341
.next_if(|next| matches!(next, Ok(Token::Nullable)))
@@ -386,7 +377,7 @@ impl<'a> Parser<'a> {
386377

387378
/// returns true if this character is a separator
388379
fn is_separator(c: char) -> bool {
389-
c == '(' || c == ')' || c == ',' || c == ' '
380+
c == '(' || c == ')' || c == ',' || c == ':' || c == ' '
390381
}
391382

392383
#[derive(Debug)]
@@ -450,50 +441,6 @@ impl<'a> Tokenizer<'a> {
450441
})?;
451442
return Ok(Token::Integer(val));
452443
}
453-
// if it started with a double quote `"`, try parsing it as a double quoted string
454-
else if c == '"' {
455-
let len = self.word.chars().count();
456-
457-
// to verify it's double quoted
458-
if let Some(last_c) = self.word.chars().last() {
459-
if last_c != '"' || len < 2 {
460-
return Err(make_error(
461-
self.val,
462-
&format!(
463-
"parsing {} as double quoted string: last char must be \"",
464-
self.word
465-
),
466-
));
467-
}
468-
}
469-
470-
if len == 2 {
471-
return Err(make_error(
472-
self.val,
473-
&format!(
474-
"parsing {} as double quoted string: empty string isn't supported",
475-
self.word
476-
),
477-
));
478-
}
479-
480-
let val: String = self.word.parse().map_err(|e| {
481-
make_error(
482-
self.val,
483-
&format!("parsing {} as double quoted string: {e}", self.word),
484-
)
485-
})?;
486-
487-
let s = val[1..len - 1].to_string();
488-
if s.contains('"') {
489-
return Err(make_error(
490-
self.val,
491-
&format!("parsing {} as double quoted string: escaped double quote isn't supported", self.word),
492-
));
493-
}
494-
495-
return Ok(Token::DoubleQuotedString(s));
496-
}
497444
}
498445

499446
// figure out what the word was
@@ -559,11 +506,63 @@ impl<'a> Tokenizer<'a> {
559506

560507
"Struct" => Token::Struct,
561508

562-
// If we don't recognize the word, treat it as a field name
563-
word => Token::FieldName(word.to_string()),
509+
token => {
510+
return Err(make_error(self.val, &format!("unknown token: {token}")));
511+
}
564512
};
565513
Ok(token)
566514
}
515+
516+
/// Parses e.g. `"foo bar"`
517+
fn parse_quoted_string(&mut self) -> ArrowResult<Token> {
518+
if self.next_char() != Some('\"') {
519+
return Err(make_error(self.val, "Expected \""));
520+
}
521+
522+
// reset temp space
523+
self.word.clear();
524+
525+
let mut is_escaped = false;
526+
527+
loop {
528+
match self.next_char() {
529+
None => {
530+
return Err(ArrowError::ParseError(format!(
531+
"Unterminated string at: \"{}",
532+
self.word
533+
)));
534+
}
535+
Some(c) => match c {
536+
'\\' => {
537+
is_escaped = true;
538+
self.word.push(c);
539+
}
540+
'"' => {
541+
if is_escaped {
542+
self.word.push(c);
543+
is_escaped = false;
544+
} else {
545+
break;
546+
}
547+
}
548+
c => {
549+
self.word.push(c);
550+
}
551+
},
552+
}
553+
}
554+
555+
let val: String = self.word.parse().map_err(|err| {
556+
ArrowError::ParseError(format!("Failed to parse string: \"{}\": {err}", self.word))
557+
})?;
558+
559+
if val.is_empty() {
560+
// Using empty strings as field names is just asking for trouble
561+
return Err(make_error(self.val, "empty strings aren't allowed"));
562+
}
563+
564+
Ok(Token::DoubleQuotedString(val))
565+
}
567566
}
568567

569568
impl Iterator for Tokenizer<'_> {
@@ -577,6 +576,9 @@ impl Iterator for Tokenizer<'_> {
577576
self.next_char();
578577
continue;
579578
}
579+
'"' => {
580+
return Some(self.parse_quoted_string());
581+
}
580582
'(' => {
581583
self.next_char();
582584
return Some(Ok(Token::LParen));
@@ -589,6 +591,10 @@ impl Iterator for Tokenizer<'_> {
589591
self.next_char();
590592
return Some(Ok(Token::Comma));
591593
}
594+
':' => {
595+
self.next_char();
596+
return Some(Ok(Token::Colon));
597+
}
592598
_ => return Some(self.parse_word()),
593599
}
594600
}
@@ -617,6 +623,7 @@ enum Token {
617623
LParen,
618624
RParen,
619625
Comma,
626+
Colon,
620627
Some,
621628
None,
622629
Integer(i64),
@@ -626,7 +633,6 @@ enum Token {
626633
FixedSizeList,
627634
Struct,
628635
Nullable,
629-
FieldName(String),
630636
}
631637

632638
impl Display for Token {
@@ -646,6 +652,7 @@ impl Display for Token {
646652
Token::LParen => write!(f, "("),
647653
Token::RParen => write!(f, ")"),
648654
Token::Comma => write!(f, ","),
655+
Token::Colon => write!(f, ":"),
649656
Token::Some => write!(f, "Some"),
650657
Token::None => write!(f, "None"),
651658
Token::FixedSizeBinary => write!(f, "FixedSizeBinary"),
@@ -658,7 +665,6 @@ impl Display for Token {
658665
Token::DoubleQuotedString(s) => write!(f, "DoubleQuotedString({s})"),
659666
Token::Struct => write!(f, "Struct"),
660667
Token::Nullable => write!(f, "nullable"),
661-
Token::FieldName(s) => write!(f, "FieldName({s})"),
662668
}
663669
}
664670
}
@@ -842,19 +848,19 @@ mod test {
842848
("Nu", "Unsupported type 'Nu'"),
843849
(
844850
r#"Timestamp(Nanosecond, Some(+00:00))"#,
845-
"Error unrecognized word: +00:00",
851+
"Error unknown token: +00",
846852
),
847853
(
848854
r#"Timestamp(Nanosecond, Some("+00:00))"#,
849-
r#"parsing "+00:00 as double quoted string: last char must be ""#,
855+
r#"Unterminated string at: "+00:00))"#,
850856
),
851857
(
852858
r#"Timestamp(Nanosecond, Some(""))"#,
853-
r#"parsing "" as double quoted string: empty string isn't supported"#,
859+
r#"empty strings aren't allowed"#,
854860
),
855861
(
856862
r#"Timestamp(Nanosecond, Some("+00:00""))"#,
857-
r#"parsing "+00:00"" as double quoted string: escaped double quote isn't supported"#,
863+
r#"Parser error: Unterminated string at: "))"#,
858864
),
859865
("Timestamp(Nanosecond, ", "Error finding next token"),
860866
(
@@ -876,9 +882,9 @@ mod test {
876882
("Decimal64(3, 500)", "Error converting 500 into i8 for Decimal64: out of range integral type conversion attempted"),
877883
("Decimal128(3, 500)", "Error converting 500 into i8 for Decimal128: out of range integral type conversion attempted"),
878884
("Decimal256(3, 500)", "Error converting 500 into i8 for Decimal256: out of range integral type conversion attempted"),
879-
("Struct(f1, Int64)", "Error finding next type, got unexpected ','"),
880-
("Struct(f1 Int64,)", "Expected a word for the name of Struct, but got trailing comma"),
881-
("Struct(f1)", "Error finding next type, got unexpected ')'"),
885+
("Struct(f1 Int64)", "Error unknown token: f1"),
886+
("Struct(\"f1\" Int64)", "Expected ':'"),
887+
("Struct(\"f1\": )", "Error finding next type, got unexpected ')'"),
882888
];
883889

884890
for (data_type_string, expected_message) in cases {
@@ -889,10 +895,13 @@ mod test {
889895
let message = e.to_string();
890896
assert!(
891897
message.contains(expected_message),
892-
"\n\ndid not find expected in actual.\n\nexpected: {expected_message}\nactual:{message}\n"
898+
"\n\ndid not find expected in actual.\n\nexpected: {expected_message}\nactual: {message}\n"
893899
);
894-
// errors should also contain a help message
895-
assert!(message.contains("Must be a supported arrow type name such as 'Int32' or 'Timestamp(Nanosecond, None)'"));
900+
901+
if !message.contains("Unterminated string") {
902+
// errors should also contain a help message
903+
assert!(message.contains("Must be a supported arrow type name such as 'Int32' or 'Timestamp(Nanosecond, None)'"), "message: {message}");
904+
}
896905
}
897906
}
898907
}
@@ -902,6 +911,6 @@ mod test {
902911
fn parse_error_type() {
903912
let err = parse_data_type("foobar").unwrap_err();
904913
assert!(matches!(err, ArrowError::ParseError(_)));
905-
assert_eq!(err.to_string(), "Parser error: Unsupported type 'foobar'. Must be a supported arrow type name such as 'Int32' or 'Timestamp(Nanosecond, None)'. Error unrecognized word: foobar");
914+
assert_eq!(err.to_string(), "Parser error: Unsupported type 'foobar'. Must be a supported arrow type name such as 'Int32' or 'Timestamp(Nanosecond, None)'. Error unknown token: foobar");
906915
}
907916
}

arrow-schema/src/schema.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -701,7 +701,7 @@ mod tests {
701701
schema.to_string(),
702702
"Field { \"first_name\": Utf8, metadata: {\"k\": \"v\"} }, \
703703
Field { \"last_name\": Utf8 }, \
704-
Field { \"address\": Struct(street Utf8, zip UInt16) }, \
704+
Field { \"address\": Struct(\"street\": Utf8, \"zip\": UInt16) }, \
705705
Field { \"interests\": nullable Dictionary(Int32, Utf8), dict_id: 123, dict_is_ordered }"
706706
)
707707
}

0 commit comments

Comments
 (0)