Skip to content

Commit a3e46a2

Browse files
Correctly url encode emoji in path segments
The previous implementation broke on emoji because it encoded one Char at a time: an emoji is stored in a String as a UTF-16 surrogate pair, and invoking char.toString on one half of that pair and then encoding it results in the encoding presenting "?" as the value, since a lone surrogate cannot be encoded on its own. This implementation works primarily on Bytes and avoids having to invoke char.toString, and is therefore capable of correctly encoding emoji characters into a UTF-8 url encoded path segment. This did involve re-working some of the valid character detection for path segments, so there is likely a delta to the overall performance, but I think it should be negligible.
1 parent 0e4eb5d commit a3e46a2

File tree

2 files changed

+29
-21
lines changed

2 files changed

+29
-21
lines changed

core/src/main/scala/uri.scala

Lines changed: 21 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -50,30 +50,33 @@ object UriEncode {
5050
def pchar = unreserved ++ (
5151
':' :: '@' :: '&' :: '=' :: '+' :: '$' :: ',' :: Nil
5252
)
53-
val segmentValid = (';' +: pchar).toSet
53+
val segmentValid: Set[Char] = (';' +: pchar).toSet
5454

55-
private val validMarkers = (0 to segmentValid.max.toInt).map(i => segmentValid(i.toChar)).toArray
56-
private def isValidChar(ch: Char) = (ch < validMarkers.length) && validMarkers(ch.toInt)
55+
// There are likely more optimal ways of doing this calculation, however
56+
// it seems unlikely that long path segments are often on the hot path
57+
// of a request in such a way that they can't be cached. If that proves
58+
// not to be true, then we can revisit.
59+
private def isValidChar(b: Byte) = {
60+
segmentValid.contains(b.toChar)
61+
}
5762

5863
def path(pathSegment: String, encoding: String = "UTF-8") = {
59-
if (pathSegment.forall(isValidChar)) {
64+
val pathBytes = pathSegment.getBytes(encoding)
65+
66+
if (pathBytes.forall(isValidChar)) {
6067
pathSegment
61-
}
62-
else {
68+
} else {
6369
val sb = new StringBuilder(pathSegment.length << 1)
6470

65-
pathSegment foreach { ch =>
66-
if (isValidChar(ch)) {
67-
sb.append(ch)
68-
}
69-
else {
70-
ch.toString.getBytes(encoding) foreach { b =>
71-
val hi = (b >>> 4) & 0xf
72-
val lo = b & 0xf
73-
sb.append('%')
74-
.append((if (hi > 9) hi + '7' else hi + '0').toChar)
75-
.append((if (lo > 9) lo + '7' else lo + '0').toChar)
76-
}
71+
pathBytes.foreach { b =>
72+
if (isValidChar(b)) {
73+
sb.append(b.toChar)
74+
} else {
75+
val hi = (b >>> 4) & 0xf
76+
val lo = b & 0xf
77+
sb.append('%')
78+
.append((if (hi > 9) hi + '7' else hi + '0').toChar)
79+
.append((if (lo > 9) lo + '7' else lo + '0').toChar)
7780
}
7881
}
7982

core/src/test/scala/uri.scala

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,26 @@
11
package dispatch.spec
22

33
import org.scalacheck._
4-
import org.scalacheck.Prop.BooleanOperators
4+
import org.scalacheck.Prop._
55

66
object UriSpecification extends Properties("Uri") {
77
/** java.net.URLDecoder should *NOT* be used for testing URI segment decoding
88
* because it implements completely different functionality: query parameter decoding
99
*/
10-
property("encode-decode") = Prop.forAll { (path: String) =>
10+
property("Encodes and decodes basic strings") = Prop.forAll { (path: String) =>
1111
!path.contains(":") ==> {
1212
new java.net.URI(dispatch.UriEncode.path(path)).getPath == path
1313
} // else Prop.throws(classOf[java.net.URISyntaxException])
1414
}
1515

1616
/** if there is nothing to escape, encoder must return original reference */
17-
property("noop") = Prop.forAll(Gen.choose(0,100)) { (n: Int) =>
17+
property("Does nothing if there's nothing eo encode") = Prop.forAll(Gen.choose(0,100)) { (n: Int) =>
1818
val path = "A" * n
1919
dispatch.UriEncode.path(path) eq path
2020
}
21+
22+
property("Encodes emoji correctly") = forAll(Gen.const("unused")) { (sample: String) =>
23+
val path = "roma🇮🇹"
24+
new java.net.URI(dispatch.UriEncode.path(path)).getPath == (path)
25+
}
2126
}

0 commit comments

Comments
 (0)