View Javadoc
1   /*
2    * ====================================================================
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *   http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing,
14   * software distributed under the License is distributed on an
15   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16   * KIND, either express or implied.  See the License for the
17   * specific language governing permissions and limitations
18   * under the License.
19   * ====================================================================
20   *
21   * This software consists of voluntary contributions made by many
22   * individuals on behalf of the Apache Software Foundation.  For more
23   * information on the Apache Software Foundation, please see
24   * <http://www.apache.org/>.
25   *
26   */
27  
28  package org.apache.hc.core5.net;
29  
import java.io.ByteArrayOutputStream;
import java.net.URI;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
import java.util.Collections;
import java.util.List;

import org.apache.hc.core5.http.NameValuePair;
import org.apache.hc.core5.http.message.BasicNameValuePair;
import org.apache.hc.core5.http.message.ParserCursor;
import org.apache.hc.core5.http.message.TokenParser;
import org.apache.hc.core5.util.Args;
46  
47  /**
 * A collection of utilities for parsing, decoding, encoding and formatting
 * URL-encoded query parameters and URI path segments.
49   *
50   * @since 4.0
51   */
52  public class URLEncodedUtils {
53  
    /** Query parameter separator: ampersand. */
    private static final char QP_SEP_A = '&';
    /** Alternative query parameter separator: semicolon. */
    private static final char QP_SEP_S = ';';
    /** Text written between a parameter name and its value. */
    private static final String NAME_VALUE_SEPARATOR = "=";
    /** Character separating URI path segments. */
    private static final char PATH_SEPARATOR = '/';

    /** Set of characters treated as path segment separators; holds only {@link #PATH_SEPARATOR}. */
    private static final BitSet PATH_SEPARATORS     = new BitSet(256);
    static {
        PATH_SEPARATORS.set(PATH_SEPARATOR);
    }
63  
64      /**
65       * Returns a list of {@link NameValuePair}s URI query parameters.
66       * By convention, {@code '&'} and {@code ';'} are accepted as parameter separators.
67       *
68       * @param uri input URI.
69       * @param charset parameter charset.
70       * @return list of query parameters.
71       *
72       * @since 4.5
73       */
74      public static List<NameValuePair> parse(final URI uri, final Charset charset) {
75          Args.notNull(uri, "URI");
76          final String query = uri.getRawQuery();
77          if (query != null && !query.isEmpty()) {
78              return parse(query, charset);
79          }
80          return createEmptyList();
81      }
82  
83      /**
84       * Returns a list of {@link NameValuePair}s URI query parameters.
85       * By convention, {@code '&'} and {@code ';'} are accepted as parameter separators.
86       *
87       * @param s URI query component.
88       * @param charset charset to use when decoding the parameters.
89       * @return list of query parameters.
90       *
91       * @since 4.2
92       */
93      public static List<NameValuePair> parse(final CharSequence s, final Charset charset) {
94          if (s == null) {
95              return createEmptyList();
96          }
97          return parse(s, charset, QP_SEP_A, QP_SEP_S);
98      }
99  
100     /**
101      * Returns a list of {@link NameValuePair}s parameters.
102      *
103      * @param s input text.
104      * @param charset parameter charset.
105      * @param separators parameter separators.
106      * @return list of query parameters.
107      *
108      * @since 4.4
109      */
110     public static List<NameValuePair> parse(
111             final CharSequence s, final Charset charset, final char... separators) {
112         Args.notNull(s, "Char sequence");
113         final TokenParser tokenParser = TokenParser.INSTANCE;
114         final BitSet delimSet = new BitSet();
115         for (final char separator: separators) {
116             delimSet.set(separator);
117         }
118         final ParserCursore/ParserCursor.html#ParserCursor">ParserCursor cursor = new ParserCursor(0, s.length());
119         final List<NameValuePair> list = new ArrayList<>();
120         while (!cursor.atEnd()) {
121             delimSet.set('=');
122             final String name = tokenParser.parseToken(s, cursor, delimSet);
123             String value = null;
124             if (!cursor.atEnd()) {
125                 final int delim = s.charAt(cursor.getPos());
126                 cursor.updatePos(cursor.getPos() + 1);
127                 if (delim == '=') {
128                     delimSet.clear('=');
129                     value = tokenParser.parseToken(s, cursor, delimSet);
130                     if (!cursor.atEnd()) {
131                         cursor.updatePos(cursor.getPos() + 1);
132                     }
133                 }
134             }
135             if (!name.isEmpty()) {
136                 list.add(new BasicNameValuePair(
137                         decodeFormFields(name, charset),
138                         decodeFormFields(value, charset)));
139             }
140         }
141         return list;
142     }
143 
144     static List<String> splitSegments(final CharSequence s, final BitSet separators) {
145         final ParserCursore/ParserCursor.html#ParserCursor">ParserCursor cursor = new ParserCursor(0, s.length());
146         // Skip leading separator
147         if (cursor.atEnd()) {
148             return Collections.emptyList();
149         }
150         if (separators.get(s.charAt(cursor.getPos()))) {
151             cursor.updatePos(cursor.getPos() + 1);
152         }
153         final List<String> list = new ArrayList<>();
154         final StringBuilder buf = new StringBuilder();
155         for (;;) {
156             if (cursor.atEnd()) {
157                 list.add(buf.toString());
158                 break;
159             }
160             final char current = s.charAt(cursor.getPos());
161             if (separators.get(current)) {
162                 list.add(buf.toString());
163                 buf.setLength(0);
164             } else {
165                 buf.append(current);
166             }
167             cursor.updatePos(cursor.getPos() + 1);
168         }
169         return list;
170     }
171 
172     static List<String> splitPathSegments(final CharSequence s) {
173         return splitSegments(s, PATH_SEPARATORS);
174     }
175 
176     /**
177      * Returns a list of URI path segments.
178      *
179      * @param s URI path component.
180      * @param charset parameter charset.
181      * @return list of segments.
182      *
183      * @since 4.5
184      */
185     public static List<String> parsePathSegments(final CharSequence s, final Charset charset) {
186         Args.notNull(s, "Char sequence");
187         final List<String> list = splitPathSegments(s);
188         for (int i = 0; i < list.size(); i++) {
189             list.set(i, urlDecode(list.get(i), charset != null ? charset : StandardCharsets.UTF_8, false));
190         }
191         return list;
192     }
193 
194     /**
195      * Returns a list of URI path segments.
196      *
197      * @param s URI path component.
198      * @return list of segments.
199      *
200      * @since 4.5
201      */
202     public static List<String> parsePathSegments(final CharSequence s) {
203         return parsePathSegments(s, StandardCharsets.UTF_8);
204     }
205 
206     static void formatSegments(final StringBuilder buf, final Iterable<String> segments, final Charset charset) {
207         for (final String segment : segments) {
208             buf.append(PATH_SEPARATOR);
209             urlEncode(buf, segment, charset, PATHSAFE, false);
210         }
211     }
212 
213     /**
214      * Returns a string consisting of joint encoded path segments.
215      *
216      * @param segments the segments.
217      * @param charset parameter charset.
218      * @return URI path component
219      *
220      * @since 4.5
221      */
222     public static String formatSegments(final Iterable<String> segments, final Charset charset) {
223         Args.notNull(segments, "Segments");
224         final StringBuilder buf = new StringBuilder();
225         formatSegments(buf, segments, charset);
226         return buf.toString();
227     }
228 
229     /**
230      * Returns a string consisting of joint encoded path segments.
231      *
232      * @param segments the segments.
233      * @return URI path component
234      *
235      * @since 4.5
236      */
237     public static String formatSegments(final String... segments) {
238         return formatSegments(Arrays.asList(segments), StandardCharsets.UTF_8);
239     }
240 
241     static void formatNameValuePairs(
242             final StringBuilder buf,
243             final Iterable<? extends NameValuePair> parameters,
244             final char parameterSeparator,
245             final Charset charset) {
246         int i = 0;
247         for (final NameValuePair parameter : parameters) {
248             if (i > 0) {
249                 buf.append(parameterSeparator);
250             }
251             encodeFormFields(buf, parameter.getName(), charset);
252             if (parameter.getValue() != null) {
253                 buf.append(NAME_VALUE_SEPARATOR);
254                 encodeFormFields(buf, parameter.getValue(), charset);
255             }
256             i++;
257         }
258     }
259 
260     static void formatParameters(
261             final StringBuilder buf,
262             final Iterable<? extends NameValuePair> parameters,
263             final Charset charset) {
264         formatNameValuePairs(buf, parameters, QP_SEP_A, charset);
265     }
266 
267     /**
268      * Returns a String that is suitable for use as an {@code application/x-www-form-urlencoded}
269      * list of parameters in an HTTP PUT or HTTP POST.
270      *
271      * @param parameters  The parameters to include.
272      * @param parameterSeparator The parameter separator, by convention, {@code '&'} or {@code ';'}.
273      * @param charset The encoding to use.
274      * @return An {@code application/x-www-form-urlencoded} string
275      *
276      * @since 4.3
277      */
278     public static String format(
279             final Iterable<? extends NameValuePair> parameters,
280             final char parameterSeparator,
281             final Charset charset) {
282         Args.notNull(parameters, "Parameters");
283         final StringBuilder buf = new StringBuilder();
284         formatNameValuePairs(buf, parameters, parameterSeparator, charset);
285         return buf.toString();
286     }
287 
288     /**
289      * Returns a String that is suitable for use as an {@code application/x-www-form-urlencoded}
290      * list of parameters in an HTTP PUT or HTTP POST.
291      *
292      * @param parameters  The parameters to include.
293      * @param charset The encoding to use.
294      * @return An {@code application/x-www-form-urlencoded} string
295      *
296      * @since 4.2
297      */
298     public static String format(
299             final Iterable<? extends NameValuePair> parameters,
300             final Charset charset) {
301         return format(parameters, QP_SEP_A, charset);
302     }
303 
    /**
     * Unreserved characters, i.e. alphanumeric, plus: {@code _ - ! . ~ ' ( ) *}
     * <p>
     *  This list is the same as the {@code unreserved} list in
     *  <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>
     */
    private static final BitSet UNRESERVED   = new BitSet(256);
    /**
     * Punctuation characters: {@code , ; : $ & + =}
     * <p>
     * These are the additional characters allowed by userinfo.
     */
    private static final BitSet PUNCT        = new BitSet(256);
    /** Characters which are safe to use in userinfo,
     * i.e. {@link #UNRESERVED} plus {@link #PUNCT}uation */
    private static final BitSet USERINFO     = new BitSet(256);
    /** Characters which are safe to use in a path segment,
     * i.e. {@link #UNRESERVED} plus {@code ; : @ & = + $ ,}
     * (the same characters as {@link #PUNCT} plus {@code @}).
     * The path separator {@code /} is not part of this set. */
    private static final BitSet PATHSAFE     = new BitSet(256);
    /** Characters which are safe to use in a query or a fragment,
     * i.e. {@link #RESERVED} plus {@link #UNRESERVED} */
    private static final BitSet URIC     = new BitSet(256);

    /**
     * Reserved characters, i.e. {@code ;/?:@&=+$,[]}
     * <p>
     *  This list is the same as the {@code reserved} list in
     *  <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>
     *  as augmented by
     *  <a href="http://www.ietf.org/rfc/rfc2732.txt">RFC 2732</a>
     */
    private static final BitSet RESERVED     = new BitSet(256);


    /**
     * Safe characters for x-www-form-urlencoded data, as per java.net.URLEncoder and browser behaviour,
     * i.e. alphanumeric plus {@code "-", "_", ".", "*"}
     */
    private static final BitSet URLENCODER   = new BitSet(256);

    /** {@link #PATHSAFE} characters plus the path separator {@code /}. */
    private static final BitSet PATH_SPECIAL = new BitSet(256);
345 
346     static {
347         // unreserved chars
348         // alpha characters
349         for (int i = 'a'; i <= 'z'; i++) {
350             UNRESERVED.set(i);
351         }
352         for (int i = 'A'; i <= 'Z'; i++) {
353             UNRESERVED.set(i);
354         }
355         // numeric characters
356         for (int i = '0'; i <= '9'; i++) {
357             UNRESERVED.set(i);
358         }
359         UNRESERVED.set('_'); // these are the charactes of the "mark" list
360         UNRESERVED.set('-');
361         UNRESERVED.set('.');
362         UNRESERVED.set('*');
363         URLENCODER.or(UNRESERVED); // skip remaining unreserved characters
364         UNRESERVED.set('!');
365         UNRESERVED.set('~');
366         UNRESERVED.set('\'');
367         UNRESERVED.set('(');
368         UNRESERVED.set(')');
369         // punct chars
370         PUNCT.set(',');
371         PUNCT.set(';');
372         PUNCT.set(':');
373         PUNCT.set('$');
374         PUNCT.set('&');
375         PUNCT.set('+');
376         PUNCT.set('=');
377         // Safe for userinfo
378         USERINFO.or(UNRESERVED);
379         USERINFO.or(PUNCT);
380 
381         // URL path safe
382         PATHSAFE.or(UNRESERVED);
383         PATHSAFE.set(';'); // param separator
384         PATHSAFE.set(':'); // RFC 2396
385         PATHSAFE.set('@');
386         PATHSAFE.set('&');
387         PATHSAFE.set('=');
388         PATHSAFE.set('+');
389         PATHSAFE.set('$');
390         PATHSAFE.set(',');
391 
392         PATH_SPECIAL.or(PATHSAFE);
393         PATH_SPECIAL.set('/');
394 
395         RESERVED.set(';');
396         RESERVED.set('/');
397         RESERVED.set('?');
398         RESERVED.set(':');
399         RESERVED.set('@');
400         RESERVED.set('&');
401         RESERVED.set('=');
402         RESERVED.set('+');
403         RESERVED.set('$');
404         RESERVED.set(',');
405         RESERVED.set('['); // added by RFC 2732
406         RESERVED.set(']'); // added by RFC 2732
407 
408         URIC.or(RESERVED);
409         URIC.or(UNRESERVED);
410     }
411 
    /** Radix of the hexadecimal digits written after {@code '%'} by {@code urlEncode}. */
    private static final int RADIX = 16;
413 
414     private static List<NameValuePair> createEmptyList() {
415         return new ArrayList<>(0);
416     }
417 
418     private static void urlEncode(
419             final StringBuilder buf,
420             final String content,
421             final Charset charset,
422             final BitSet safechars,
423             final boolean blankAsPlus) {
424         if (content == null) {
425             return;
426         }
427         final ByteBuffer bb = charset.encode(content);
428         while (bb.hasRemaining()) {
429             final int b = bb.get() & 0xff;
430             if (safechars.get(b)) {
431                 buf.append((char) b);
432             } else if (blankAsPlus && b == ' ') {
433                 buf.append('+');
434             } else {
435                 buf.append("%");
436                 final char hex1 = Character.toUpperCase(Character.forDigit((b >> 4) & 0xF, RADIX));
437                 final char hex2 = Character.toUpperCase(Character.forDigit(b & 0xF, RADIX));
438                 buf.append(hex1);
439                 buf.append(hex2);
440             }
441         }
442     }
443 
444     private static String urlDecode(
445             final String content,
446             final Charset charset,
447             final boolean plusAsBlank) {
448         if (content == null) {
449             return null;
450         }
451         final ByteBuffer bb = ByteBuffer.allocate(content.length());
452         final CharBuffer cb = CharBuffer.wrap(content);
453         while (cb.hasRemaining()) {
454             final char c = cb.get();
455             if (c == '%' && cb.remaining() >= 2) {
456                 final char uc = cb.get();
457                 final char lc = cb.get();
458                 final int u = Character.digit(uc, 16);
459                 final int l = Character.digit(lc, 16);
460                 if (u != -1 && l != -1) {
461                     bb.put((byte) ((u << 4) + l));
462                 } else {
463                     bb.put((byte) '%');
464                     bb.put((byte) uc);
465                     bb.put((byte) lc);
466                 }
467             } else if (plusAsBlank && c == '+') {
468                 bb.put((byte) ' ');
469             } else {
470                 bb.put((byte) c);
471             }
472         }
473         bb.flip();
474         return charset.decode(bb).toString();
475     }
476 
477     static String decodeFormFields(final String content, final Charset charset) {
478         if (content == null) {
479             return null;
480         }
481         return urlDecode(content, charset != null ? charset : StandardCharsets.UTF_8, true);
482     }
483 
484     static void encodeFormFields(final StringBuilder buf, final String content, final Charset charset) {
485         if (content == null) {
486             return;
487         }
488         urlEncode(buf, content, charset != null ? charset : StandardCharsets.UTF_8, URLENCODER, true);
489     }
490 
491     static void encUserInfo(final StringBuilder buf, final String content, final Charset charset) {
492         urlEncode(buf, content, charset != null ? charset : StandardCharsets.UTF_8, USERINFO, false);
493     }
494 
495     static void encUric(final StringBuilder buf, final String content, final Charset charset) {
496         urlEncode(buf, content, charset != null ? charset : StandardCharsets.UTF_8, URIC, false);
497     }
498 
499 }