GlueGen v2.6.0-rc-20250712
GlueGen, Native Binding Generator for Java™ (public API).
StringUtil.java
Go to the documentation of this file.
1/**
2 * Copyright 2014 JogAmp Community. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without modification, are
5 * permitted provided that the following conditions are met:
6 *
7 * 1. Redistributions of source code must retain the above copyright notice, this list of
8 * conditions and the following disclaimer.
9 *
10 * 2. Redistributions in binary form must reproduce the above copyright notice, this list
11 * of conditions and the following disclaimer in the documentation and/or other materials
12 * provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY JogAmp Community ``AS IS'' AND ANY EXPRESS OR IMPLIED
15 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
16 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JogAmp Community OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
19 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
20 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
21 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
22 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23 *
24 * The views and conclusions contained in the software and documentation are those of the
25 * authors and should not be interpreted as representing official policies, either expressed
26 * or implied, of JogAmp Community.
27 */
28package com.jogamp.common.util;
29
30import java.util.ArrayList;
31import java.util.List;
32
/** Basic utility functions for {@link String} and {@link CharSequence} in general. */
public class StringUtil {
    /** Linefeed character unicode {@code '\n'}, 0x000A. */
    public static final char LF = '\n';
    /** CR character unicode {@code '\r'}, 0x000D. */
    public static final char CR = '\r';
    /** Space character unicode {@code ' '}, 0x0020. */
    public static final char SPACE = ' ';

    /**
     * List of ASCII and Unicode space separator, aka {@code Whitespace}.
     * <p>
     * All contained characters are within the basic multilingual plane,
     * i.e. each entry occupies exactly one {@code char}.
     * </p>
     * @see <a href="https://www.compart.com/en/unicode/category/Zs">Unicode category Zs</a>
     * @see <a href="https://en.wikipedia.org/wiki/Whitespace_character">Whitespace character</a>
     * @see <a href="https://www.unicode.org/reports/tr44/#General_Category_Values">UAX #44 General Category Values</a>
     * @see <a href="https://www.w3schools.com/charsets/ref_utf_punctuation.asp">UTF-8 General Punctuation</a>
     */
    public static final String WHITESPACE = String.valueOf(new char[] {
            '\t',    // char tabulator
            LF,      // LF 0x000A
            0x000B,  // line tab
            0x000C,  // FF
            CR,      // CR 0x000D
            SPACE,   // SPACE 0x0020
            0x0085,  // Next Line
            0x00A0,  // No-break space (NBSP)
            0x1680,  // Ogham Space Mark
            0x2000,  // En Quad
            0x2001,  // Em Quad
            0x2002,  // En Space
            0x2003,  // Em Space
            0x2004,  // Three-Per Em-Space
            0x2005,  // Four-Per Em-Space
            0x2006,  // Six-Per Em-Space
            0x2007,  // Figure-Space
            0x2008,  // Punctuation-Space
            0x2009,  // Thin-Space
            0x200A,  // Hair-Space
            0x202F,  // Narrow No-break space (NNBSP)
            0x205F,  // Medium Mathematical space (MMSP)
            0x3000   // Ideographic Space
    });

    /** Returns true if given codepoint is included within {@link #WHITESPACE}. */
    public static boolean isWhitespace(final int cp) {
        return 0 <= WHITESPACE.indexOf(cp);
    }

    /**
     * Returns true if given codepoint is a fullwidth unicode character, i.e. within [0xff01..0xff60].
     * @see <a href="https://www.compart.com/en/unicode/block/U+FF00">Halfwidth and Fullwidth Forms</a>
     */
    public static boolean isFullwidth(final int cp) {
        return 0xff01 <= cp && cp <= 0xff60;
    }

    /**
     * Returns true if given codepoint is a halfwidth unicode character, i.e. within [0xff61..0xffee].
     * @see <a href="https://www.compart.com/en/unicode/block/U+FF00">Halfwidth and Fullwidth Forms</a>
     */
    public static boolean isHalfwidth(final int cp) {
        // Range test: lower bound must be inclusive '<=', not an equality test.
        return 0xff61 <= cp && cp <= 0xffee;
    }

    /** Returns true if given codepoint is either {@link #isFullwidth(int)}, {@link #isHalfwidth(int)} or {@link #isWhitespace(int)}. */
    public static boolean hasSpace(final int cp) {
        return isFullwidth(cp) || isHalfwidth(cp) || isWhitespace(cp);
    }

    /**
     * Returns number of lines separated by {@link #LF}.
     * <p>
     * Empty lines between two {@link #LF} are counted,
     * a trailing {@link #LF} does not start another line.
     * </p>
     * @param s the character sequence, {@code null} or an empty sequence results in zero
     * @return number of lines
     */
    public static int getLineCount(final CharSequence s) {
        if( null == s ) {
            return 0;
        }
        final int len = s.length();
        int lc = 0;
        int i = 0;
        while( i < len ) {
            ++lc; // at least one character left, hence another line
            final int j = indexOf(s, LF, i);
            if( 0 > j ) {
                break;
            }
            i = j + 1; // continue behind the linefeed
        }
        return lc;
    }

    /**
     * Calls {@link String#indexOf(int, int)}
     * @param hay the unicode character string to search in from {@code start}
     * @param needle the unicode code point character to search
     * @param start index to start searching
     * @return {@code -1} if not found or {@code hay} is {@code null}, otherwise [0..{@link String#length()}-1].
     * @see #indexOf(CharSequence, int, int)
     * @see String#indexOf(int, int)
     */
    public static int indexOf(final String hay, final int needle, final int start) {
        if( null != hay ) {
            return hay.indexOf(needle, start);
        }
        return -1;
    }

    /**
     * Naive implementation of {@link String#indexOf(int, int)} for type {@link CharSequence}.
     * <p>
     * Uses {@link String#indexOf(int, int)} if {@code hay} is of type {@link String},
     * otherwise performs a linear scan over the {@code char} values,
     * matching the surrogate pair in case of a supplementary code point.
     * </p>
     * @param hay the unicode character string to search in from {@code start}
     * @param needle the unicode code point character to search
     * @param start index to start searching, negative values are treated as zero
     * @return {@code -1} if not found or {@code hay} is {@code null}, otherwise [0..{@link CharSequence#length()}-1].
     * @see #indexOf(String, int, int)
     * @see String#indexOf(int, int)
     */
    public static int indexOf(final CharSequence hay, final int needle, final int start) {
        if( null == hay ) {
            return -1;
        }
        if( hay instanceof String ) {
            return ((String) hay).indexOf(needle, start);
        }
        final int l = hay.length();
        final int s = Math.max(0, start);
        if( needle < Character.MIN_SUPPLEMENTARY_CODE_POINT ) {
            // BMP code point (or invalid negative value, which never matches): single char compare
            for(int i = s; i < l; i++) {
                if( hay.charAt(i) == needle ) {
                    return i;
                }
            }
        } else if( Character.isValidCodePoint(needle) ) {
            // Supplementary code point: compare high and low surrogate
            final char[] chars = Character.toChars(needle);
            for(int i = s; i < l - 1; i++) {
                if( hay.charAt(i) == chars[0] && hay.charAt(i + 1) == chars[1] ) {
                    return i;
                }
            }
        }
        return -1;
    }

    /**
     * Returns true if given codepoint is contained in {@code separators},
     * or is {@link Character#isWhitespace(int) whitespace} if {@code separators} is {@code null}.
     */
    private static boolean isSeparator(final int cp, final String separators) {
        return null == separators ? Character.isWhitespace(cp) : 0 <= separators.indexOf(cp);
    }

    /**
     * Remove all leading, trailing and duplicate-within {@code separators} unicode character from the {@code text}.
     * <p>
     * Duplicate {@code separators} unicode character within the {@code text} are reduced to one occurrence
     * and might be replaced with {@code replacement} if not {@code null}.
     * </p>
     * <p>
     * The {@code text} is processed per unicode code point, i.e. supplementary characters are preserved.
     * </p>
     * @param text the source text, {@code null} results in an empty string
     * @param separators separator unicode characters, pass {@code null} for {@link Character#isWhitespace(int) whitespace}.
     *                   Consider using {@link #WHITESPACE} to cover all unicode space character.
     * @param replacement optional replacement string for matched separator within sequence removing duplicated.
     *                    If {@code null}, the first found separator is used.
     * @return stripped text, an empty string if {@code text} is {@code null}, empty or contains separators only
     */
    public static String trim(final String text, final String separators, final String replacement) {
        if( null == text ) {
            return "";
        }
        final int len = text.length();
        final StringBuilder sb = new StringBuilder(len);
        // First separator of the current separator run, -1 if no run is pending.
        // A run is only emitted once a non-separator follows, which drops trailing runs
        // without having to delete from the builder afterwards.
        int pendingSep = -1;
        for(int i = 0; i < len; ) {
            final int cp = text.codePointAt(i);
            i += Character.charCount(cp); // step per code point, not per char
            if( isSeparator(cp, separators) ) {
                if( 0 > pendingSep ) {
                    pendingSep = cp;
                }
                continue;
            }
            if( 0 <= pendingSep ) {
                if( 0 < sb.length() ) { // skip leading separator run
                    if( null != replacement ) {
                        sb.append(replacement);
                    } else {
                        sb.appendCodePoint(pendingSep);
                    }
                }
                pendingSep = -1;
            }
            sb.appendCodePoint(cp);
        }
        return sb.toString();
    }

    /**
     * Returns a list of split {@code text} at {@code separators} or {@link Character#isWhitespace(int) whitespace}.
     * <p>
     * Each line's cutting point is the first {@code separator} or {@link Character#isWhitespace(int) whitespace}
     * occurrence starting at {@code text.length() / lineCount * 0.9}.
     * </p>
     * <p>
     * The separator or {@link Character#isWhitespace(int) whitespace} character
     * at the cutting point is skipped in the resulting list of the split parts, i.e. lines.
     * </p>
     * <p>
     * The result may contain less than {@code lineCount} lines if not enough cutting points are found.
     * </p>
     * @param text the text to be split, {@code null} results in an empty list
     * @param lineCount number of resulting lines, zero results in an empty list
     * @param separators separator unicode characters, pass {@code null} for {@link Character#isWhitespace(int) whitespace}.
     *                   Consider using {@link #WHITESPACE} to cover all unicode space character.
     * @return list of split lines
     * @see #split(String, int, String, String)
     */
    public static List<String> split(final String text, final int lineCount, final String separators) {
        final List<String> list = new ArrayList<>();
        if( null == text || 0 == lineCount ) {
            return list;
        }
        final int len = text.length();
        if( 0 == len ) {
            return list;
        }
        if( 1 == lineCount ) {
            list.add(text);
            return list;
        }
        final int segLen = (int)Math.ceil((float)len / (float)lineCount * 0.9f);

        int start = 0;
        int i = segLen;
        while( i < len && list.size() < lineCount - 1 ) {
            if( 0 < i && Character.isLowSurrogate(text.charAt(i)) && Character.isHighSurrogate(text.charAt(i - 1)) ) {
                ++i; // landed inside a surrogate pair, move on to the next code point boundary
                continue;
            }
            final int cp = text.codePointAt(i);
            final int cpLen = Character.charCount(cp);
            if( isSeparator(cp, separators) ) {
                list.add(text.substring(start, i));
                start = i + cpLen; // skip the whole separator, which may occupy two chars
                i += segLen;
            } else {
                i += cpLen;
            }
        }
        if( start < len ) {
            list.add(text.substring(start, len));
        }
        return list;
    }

    /**
     * Returns a multi-line string of split {@code text} at {@code separators} or {@link Character#isWhitespace(int) whitespace}
     * glued with given {@code lineSeparator}.
     * <p>
     * Each line's cutting point is the first {@code separator} or {@link Character#isWhitespace(int) whitespace}
     * occurrence starting at {@code text.length() / lineCount * 0.9}.
     * </p>
     * <p>
     * The separator character or {@link Character#isWhitespace(int) whitespace}
     * at the cutting point is skipped in the string of glued split parts, i.e. lines.
     * </p>
     * @param text the text to be split, {@code null} results in an empty string
     * @param lineCount number of resulting lines
     * @param separators separator unicode characters, pass {@code null} for {@link Character#isWhitespace(int) whitespace}.
     *                   Consider using {@link #WHITESPACE} to cover all unicode space character.
     * @param lineSeparator the glue placed between the split lines in the concatenated result
     * @return the split lines concatenated with {@code lineSeparator} in between
     * @see #split(String, int, String)
     */
    public static String split(final String text, final int lineCount, final String separators, final String lineSeparator) {
        final List<String> lines = split(text, lineCount, separators);
        final StringBuilder sb = new StringBuilder();
        boolean addGlue = false;
        for(final String l : lines) {
            if( addGlue ) {
                sb.append(lineSeparator);
            }
            sb.append(l);
            addGlue = true;
        }
        return sb.toString();
    }

}
Basic utility functions for String and CharSequence in general.
Definition: StringUtil.java:34
static boolean hasSpace(final int cp)
Returns true if given codepoint is either isFullwidth(int), isHalfwidth(int) or isWhitespace(int).
Definition: StringUtil.java:97
static List< String > split(final String text, final int lineCount, final String separators)
Returns a list of split text at separators or whitespace.
static int getLineCount(final CharSequence s)
Returns the number of lines separated by LF; empty lines between two LF are counted, a trailing LF does not start another line.
static final String WHITESPACE
List of ASCII & Unicode space separator, aka Whitespace.
Definition: StringUtil.java:50
static boolean isWhitespace(final int cp)
Returns true if the given codepoint is included within WHITESPACE.
Definition: StringUtil.java:77
static final char LF
Linefeed character unicode '\n', 0x000A.
Definition: StringUtil.java:36
static String trim(final String text, final String separators, final String replacement)
Remove all leading, trailing and duplicate-within separators unicode character from the text.
static final char SPACE
Space character unicode ' ', 0x0020.
Definition: StringUtil.java:40
static boolean isHalfwidth(final int cp)
Returns true if given codepoint is a halfwidth unicode character.
Definition: StringUtil.java:92
static boolean isFullwidth(final int cp)
Returns true if given codepoint is a fullwidth unicode character.
Definition: StringUtil.java:85
static String split(final String text, final int lineCount, final String separators, final String lineSeparator)
Returns a multi-line string of split text at separators or whitespace glued with given lineSeparator.
static int indexOf(final CharSequence hay, final int needle, final int start)
Naive implementation of String#indexOf(int, int) for type CharSequence.
static int indexOf(final String hay, final int needle, final int start)
Calls String#indexOf(int, int).
static final char CR
CR character unicode '\r', 0x000D.
Definition: StringUtil.java:38