/
StringUtil.java
367 lines (328 loc) · 13.3 KB
/
StringUtil.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
package org.jsoup.internal;
import org.jsoup.helper.Validate;
import javax.annotation.Nullable;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Arrays;
import java.util.Collection;
import java.util.Iterator;
import java.util.Stack;
import java.util.regex.Pattern;
/**
A minimal String utility class. Designed for <b>internal</b> jsoup use only - the API and outcome may change without
notice.
*/
public final class StringUtil {
// memoised padding up to 21 (blocks 0 to 20 spaces)
static final String[] padding = {"", " ", " ", " ", " ", " ", " ", " ", " ",
" ", " ", " ", " ", " ", " ", " ",
" ", " ", " ", " ", " "};
/**
 * Joins the string form of each element of a collection, in iteration order, with a separator
 * between consecutive elements.
 * @param strings collection of objects to join
 * @param sep string to place between elements
 * @return the joined string
 */
public static String join(Collection<?> strings, String sep) {
    final Iterator<?> elements = strings.iterator();
    return join(elements, sep);
}
/**
 * Join the remaining elements of an iterator by a separator.
 * <p>
 * A {@code null} element is rendered as the text {@code "null"}, wherever it appears in the sequence.
 * @param strings iterator of string objects
 * @param sep string to place between strings
 * @return joined string; the empty string if the iterator has no elements
 */
public static String join(Iterator<?> strings, String sep) {
    if (!strings.hasNext())
        return "";
    // String.valueOf rather than toString(): later elements go through StringBuilder.append(Object), which
    // renders null as "null"; the first element must behave the same way instead of throwing an NPE.
    String start = String.valueOf(strings.next());
    if (!strings.hasNext()) // only one, avoid builder
        return start;
    StringJoiner j = new StringJoiner(sep);
    j.add(start);
    while (strings.hasNext()) {
        j.add(strings.next());
    }
    return j.complete();
}
/**
 * Joins the elements of a string array, in order, with a separator between consecutive elements.
 * @param strings array of strings to join
 * @param sep string to place between strings
 * @return the joined string
 */
public static String join(String[] strings, String sep) {
    final Collection<String> items = Arrays.asList(strings);
    return join(items, sep);
}
/**
 Joins stringable objects incrementally, so callers can decide item by item what to include.
 @since 1.14.1
 */
public static class StringJoiner {
    @Nullable StringBuilder sb = borrowBuilder(); // nulled once released, so a completed joiner can't be reused by mistake
    final String separator;
    boolean first = true;

    /**
     Creates a joiner using the given separator. The builder borrowed for this joiner is only handed back by
     {@link #complete()}, so that method MUST be called when done.
     @param separator the token to insert between items
     */
    public StringJoiner(String separator) {
        this.separator = separator;
    }

    /**
     Adds a new item; every item after the first is preceded by the separator.
     */
    public StringJoiner add(Object stringy) {
        Validate.notNull(sb); // don't reuse
        if (first)
            first = false;
        else
            sb.append(separator);
        sb.append(stringy);
        return this;
    }

    /**
     Appends content directly onto the current item, without inserting a separator.
     */
    public StringJoiner append(Object stringy) {
        Validate.notNull(sb); // don't reuse
        sb.append(stringy);
        return this;
    }

    /**
     Releases the builder and returns the joined string. The joiner cannot be used again afterwards.
     */
    public String complete() {
        final String joined = releaseBuilder(sb);
        sb = null;
        return joined;
    }
}
/**
 * Returns a string of spaces, capped at 30. Call {@link #padding(int, int)} directly when more than
 * 30 spaces are required.
 * @param width amount of padding desired
 * @return string of spaces * width
 */
public static String padding(int width) {
    final int defaultMaxWidth = 30;
    return padding(width, defaultMaxWidth);
}
/**
 * Returns space padding, limited to at most {@code maxPaddingWidth} spaces.
 * @param width amount of padding desired; must not be negative
 * @param maxPaddingWidth the most spaces that will be returned, or -1 for no maximum
 * @return a string of {@code width} spaces, or of {@code maxPaddingWidth} spaces if that is smaller
 * @throws IllegalArgumentException if {@code width} is negative or {@code maxPaddingWidth} is less than -1
 */
public static String padding(int width, int maxPaddingWidth) {
    if (width < 0)
        throw new IllegalArgumentException("width must be >= 0");
    if (maxPaddingWidth < -1)
        throw new IllegalArgumentException("maxPaddingWidth must be >= -1");
    // apply the cap before the memo lookup, so that it is honoured for small widths too
    if (maxPaddingWidth != -1)
        width = Math.min(width, maxPaddingWidth);
    if (width < padding.length)
        return padding[width];
    char[] out = new char[width];
    Arrays.fill(out, ' ');
    return String.valueOf(out);
}
/**
 * Checks whether a string is blank, i.e. it is null, empty, or made up solely of whitespace
 * (" ", \r\n, \t, etc).
 * @param string string to test
 * @return if string is blank
 */
public static boolean isBlank(String string) {
    if (string == null)
        return true;
    final int length = string.length();
    int pos = 0;
    while (pos < length) { // an empty string never enters the loop, and so is blank
        if (!StringUtil.isWhitespace(string.codePointAt(pos)))
            return false;
        pos++;
    }
    return true;
}
/**
 * Tests if a string is numeric, i.e. contains only digit characters (as defined by
 * {@link Character#isDigit(int)}).
 * @param string string to test
 * @return true if only digit chars, false if empty or null or contains non-digit chars
 */
public static boolean isNumeric(String string) {
    if (string == null || string.isEmpty())
        return false;
    final int length = string.length();
    int codePoint;
    // step by whole code points: advancing one char at a time would re-read the low surrogate of a
    // supplementary-plane digit on its own, and wrongly reject it
    for (int i = 0; i < length; i += Character.charCount(codePoint)) {
        codePoint = string.codePointAt(i);
        if (!Character.isDigit(codePoint))
            return false;
    }
    return true;
}
/**
 * Checks a code point against the HTML spec's set of whitespace: space, tab, line feed, form feed,
 * and carriage return. Used for output HTML.
 * @param c code point to test
 * @return true if code point is whitespace, false otherwise
 * @see #isActuallyWhitespace(int)
 */
public static boolean isWhitespace(int c){
    switch (c) {
        case ' ':
        case '\t':
        case '\n':
        case '\f':
        case '\r':
            return true;
        default:
            return false;
    }
}
/**
 * Checks whether a code point is whitespace going by how it looks when rendered. Used for Element.text etc.
 * @param c code point to test
 * @return true if code point is whitespace, false otherwise
 */
public static boolean isActuallyWhitespace(int c){
    switch (c) {
        case ' ':
        case '\t':
        case '\n':
        case '\f':
        case '\r':
        case 160: // &nbsp; (non-breaking space). Not in the spec but expected.
            return true;
        default:
            return false;
    }
}
/**
 * Tests if a code point is one of the characters treated as invisible: the zero width space or the
 * soft hyphen.
 * @param c code point to test
 * @return true if the code point is an invisible character, false otherwise
 */
public static boolean isInvisibleChar(int c) {
    switch (c) {
        case 8203: // zero width sp
        case 173:  // soft hyphen
            return true;
        default:
            // previously also included zw non join, zw join - but removing those breaks semantic meaning of text
            return false;
    }
}
/**
 * Returns a copy of the string with its whitespace normalised: each run of whitespace characters
 * (spaces, newlines, tabs, etc) becomes one plain space. Leading whitespace is not stripped.
 * @param string content to normalise
 * @return normalised string
 */
public static String normaliseWhitespace(String string) {
    final StringBuilder normalised = StringUtil.borrowBuilder();
    appendNormalisedWhitespace(normalised, string, false);
    return StringUtil.releaseBuilder(normalised);
}
/**
 * Appends a whitespace-normalised copy of a string to a builder: each run of whitespace is written as
 * one space, and invisible characters are dropped.
 * @param accum builder to append to
 * @param string string to normalize whitespace within
 * @param stripLeading set to true to omit any whitespace that precedes the first visible character
 */
public static void appendNormalisedWhitespace(StringBuilder accum, String string, boolean stripLeading) {
    final int length = string.length();
    boolean previousWasSpace = false;
    boolean seenContent = false;
    int pos = 0;
    while (pos < length) {
        final int codePoint = string.codePointAt(pos);
        pos += Character.charCount(codePoint); // step over both halves of a surrogate pair

        if (isActuallyWhitespace(codePoint)) {
            // one space per whitespace run; and none at all ahead of the content when stripping
            final boolean suppressed = previousWasSpace || (stripLeading && !seenContent);
            if (!suppressed) {
                accum.append(' ');
                previousWasSpace = true;
            }
        } else if (!isInvisibleChar(codePoint)) {
            accum.appendCodePoint(codePoint);
            previousWasSpace = false;
            seenContent = true;
        }
        // invisible characters fall through: skipped without affecting the whitespace state
    }
}
/**
 * Tests if a string equals any of the supplied candidates, checked in order.
 * @param needle the string to look for
 * @param haystack the candidate strings
 * @return true if any candidate equals the needle
 */
public static boolean in(final String needle, final String... haystack) {
    for (final String candidate : haystack) {
        if (candidate.equals(needle))
            return true;
    }
    return false;
}
/**
 * Tests if a string is present in an array, using a binary search. The search relies on
 * {@link Arrays#binarySearch(Object[], Object)}, so the array needs to be sorted.
 * @param needle the string to look for
 * @param haystack the sorted array to search
 * @return true if the search finds the needle
 */
public static boolean inSorted(String needle, String[] haystack) {
    final int index = Arrays.binarySearch(haystack, needle);
    return index >= 0;
}
/**
 Checks whether every character of a String is an ASCII character.
 @param string scanned string
 @return true if all characters are in range 0 - 127
 */
public static boolean isAscii(String string) {
    Validate.notNull(string);
    int i = 0;
    while (i < string.length()) {
        if (string.charAt(i) > 127) // beyond the ascii range
            return false;
        i++;
    }
    return true;
}
// leading "./" and "../" segments left at the very start of a resolved path
private static final Pattern extraDotSegmentsPattern = Pattern.compile("^/((\\.{1,2}/)+)");
// a URI scheme prefix, per RFC 3986: a letter, then letters / digits / '+' / '.' / '-', then a colon.
// The '-' is placed last in the character class so it is a literal; written as "+-." it forms a range,
// which would also accept ','.
private static final Pattern validUriScheme = Pattern.compile("^[a-zA-Z][a-zA-Z0-9+.-]*:");

/**
 * Create a new absolute URL, from a provided existing absolute URL and a relative URL component.
 * @param base the existing absolute base URL
 * @param relUrl the relative URL to resolve. (If it's already absolute, it will be returned)
 * @return the resolved absolute URL
 * @throws MalformedURLException if an error occurred generating the URL
 */
public static URL resolve(URL base, String relUrl) throws MalformedURLException {
    // workaround: java resolves '//path/file + ?foo' to '//path/?foo', not '//path/file?foo' as desired
    if (relUrl.startsWith("?"))
        relUrl = base.getPath() + relUrl;
    // workaround: //example.com + ./foo = //example.com/./foo, not //example.com/foo
    URL url = new URL(base, relUrl);
    String fixedFile = extraDotSegmentsPattern.matcher(url.getFile()).replaceFirst("/");
    if (url.getRef() != null) {
        // getFile() does not include the fragment, so it is re-attached here
        fixedFile = fixedFile + "#" + url.getRef();
    }
    // NOTE(review): only protocol, host, port and file are carried over; any user-info on the resolved
    // URL is not passed along - confirm that is intended.
    return new URL(url.getProtocol(), url.getHost(), url.getPort(), fixedFile);
}

/**
 * Create a new absolute URL, from a provided existing absolute URL and a relative URL component.
 * @param baseUrl the existing absolute base URL
 * @param relUrl the relative URL to resolve. (If it's already absolute, it will be returned)
 * @return an absolute URL if one was able to be generated, or the empty string if not
 */
public static String resolve(final String baseUrl, final String relUrl) {
    try {
        URL base;
        try {
            base = new URL(baseUrl);
        } catch (MalformedURLException e) {
            // the base is unsuitable, but the attribute/rel may be abs on its own, so try that
            URL abs = new URL(relUrl);
            return abs.toExternalForm();
        }
        return resolve(base, relUrl).toExternalForm();
    } catch (MalformedURLException e) {
        // it may still be valid, just that Java doesn't have a registered stream handler for it, e.g. tel
        // we test here vs at start to normalize supported URLs (e.g. HTTP -> http)
        return validUriScheme.matcher(relUrl).find() ? relUrl : "";
    }
}
// each thread keeps its own stack of idle StringBuilders, created empty on first use
private static final ThreadLocal<Stack<StringBuilder>> threadLocalBuilders = new ThreadLocal<Stack<StringBuilder>>() {
    @Override
    protected Stack<StringBuilder> initialValue() {
        return new Stack<>();
    }
};

/**
 * Hands out a StringBuilder from this thread's cache of idle builders, creating a new one if the cache
 * is empty. Reusing builders this way cuts down on StringBuilder allocations and GCs.
 * <p>
 * The caller must give the builder back with {@link #releaseBuilder} once its work is complete.
 * @return an empty StringBuilder
 */
public static StringBuilder borrowBuilder() {
    final Stack<StringBuilder> idle = threadLocalBuilders.get();
    if (idle.empty())
        return new StringBuilder(MaxCachedBuilderSize);
    return idle.pop();
}
/**
 * Release a borrowed builder. Care must be taken not to use the builder after it has been returned, as its
 * contents may be changed by this method, or by whoever borrows it next.
 * <p>
 * The builder is only kept for reuse while fewer than {@code MaxIdleBuilders} are idle on this thread, and a
 * builder whose buffer has grown past {@code MaxCachedBuilderSize} is swapped for a fresh one rather than cached.
 * @param sb the StringBuilder to release.
 * @return the string value of the released String Builder (as an incentive to release it!).
 */
public static String releaseBuilder(StringBuilder sb) {
    Validate.notNull(sb);
    String string = sb.toString();

    Stack<StringBuilder> builders = threadLocalBuilders.get();
    if (builders.size() < MaxIdleBuilders) { // when the pool is full the builder is simply dropped
        // check capacity, not length: a builder that grew and was then truncated still holds its big buffer
        if (sb.capacity() > MaxCachedBuilderSize)
            sb = new StringBuilder(MaxCachedBuilderSize); // make sure it hasn't grown too big
        else
            sb.setLength(0); // make sure it's emptied on release
        builders.push(sb);
    }
    return string;
}

// initial capacity of pooled builders, and the largest buffer that will be kept in the pool
private static final int MaxCachedBuilderSize = 8 * 1024;
// the most idle builders kept per thread
private static final int MaxIdleBuilders = 8;
}