WPILibC++ 2024.3.2
lexer.h
Go to the documentation of this file.
1// __ _____ _____ _____
2// __| | __| | | | JSON for Modern C++
3// | | |__ | | | | | | version 3.11.2
4// |_____|_____|_____|_|___| https://github.com/nlohmann/json
5//
6// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
7// SPDX-License-Identifier: MIT
8
9#pragma once
10
11#include <array> // array
12#include <clocale> // localeconv
13#include <cstddef> // size_t
14#include <cstdio> // snprintf
15#include <cstdlib> // strtof, strtod, strtold, strtoll, strtoull
16#include <initializer_list> // initializer_list
17#include <string> // char_traits, string
18#include <utility> // move
19#include <vector> // vector
20
24
26namespace detail
27{
28
29///////////
30// lexer //
31///////////
32
/*!
@brief base class of the lexer: token kinds and their diagnostic names

Holds the token_type enumeration and a helper that maps each token kind to a
human-readable name. The template parameter is not used by the members below.
*/
template<typename BasicJsonType>
class lexer_base
{
  public:
    /// token types for the parser
    enum class token_type
    {
        uninitialized,    ///< indicating the scanner is uninitialized
        literal_true,     ///< the `true` literal
        literal_false,    ///< the `false` literal
        literal_null,     ///< the `null` literal
        value_string,     ///< a string -- use get_string() for actual value
        value_unsigned,   ///< an unsigned integer -- use get_number_unsigned() for actual value
        value_integer,    ///< a signed integer -- use get_number_integer() for actual value
        value_float,      ///< a floating-point number -- use get_number_float() for actual value
        begin_array,      ///< the character for array begin `[`
        begin_object,     ///< the character for object begin `{`
        end_array,        ///< the character for array end `]`
        end_object,       ///< the character for object end `}`
        name_separator,   ///< the name separator `:`
        value_separator,  ///< the value separator `,`
        parse_error,      ///< indicating a parse error
        end_of_input,     ///< indicating the end of the input buffer
        literal_or_value  ///< a literal or the begin of a value (only for diagnostics)
    };

    /*!
    @brief return name of values of type token_type (only used for errors)

    @param[in] t  token kind to describe
    @return pointer to a string literal naming @a t; all three number kinds
            share the name "number literal"; values outside the enumeration
            yield "unknown token"
    */
    static const char* token_type_name(const token_type t) noexcept
    {
        switch (t)
        {
            case token_type::uninitialized:
                return "<uninitialized>";
            case token_type::literal_true:
                return "true literal";
            case token_type::literal_false:
                return "false literal";
            case token_type::literal_null:
                return "null literal";
            case token_type::value_string:
                return "string literal";
            case token_type::value_unsigned:
            case token_type::value_integer:
            case token_type::value_float:
                return "number literal";
            case token_type::begin_array:
                return "'['";
            case token_type::begin_object:
                return "'{'";
            case token_type::end_array:
                return "']'";
            case token_type::end_object:
                return "'}'";
            case token_type::name_separator:
                return "':'";
            case token_type::value_separator:
                return "','";
            case token_type::parse_error:
                return "<parse error>";
            case token_type::end_of_input:
                return "end of input";
            case token_type::literal_or_value:
                return "'[', '{', or a literal";
            // LCOV_EXCL_START
            default: // catch non-enum values
                return "unknown token";
                // LCOV_EXCL_STOP
        }
    }
};
105/*!
106@brief lexical analysis
107
108This class organizes the lexical analysis during JSON deserialization.
109*/
110template<typename BasicJsonType, typename InputAdapterType>
111class lexer : public lexer_base<BasicJsonType>
112{
113 using number_integer_t = typename BasicJsonType::number_integer_t;
114 using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
115 using number_float_t = typename BasicJsonType::number_float_t;
116 using string_t = typename BasicJsonType::string_t;
117 using char_type = typename InputAdapterType::char_type;
118 using char_int_type = typename std::char_traits<char_type>::int_type;
119
120 public:
122
123 explicit lexer(InputAdapterType&& adapter, bool ignore_comments_ = false) noexcept
124 : ia(std::move(adapter))
125 , ignore_comments(ignore_comments_)
126 , decimal_point_char(static_cast<char_int_type>(get_decimal_point()))
127 {}
128
129 // delete because of pointer members
130 lexer(const lexer&) = delete;
131 lexer(lexer&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor)
132 lexer& operator=(lexer&) = delete;
133 lexer& operator=(lexer&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor)
134 ~lexer() = default;
135
136 private:
137 /////////////////////
138 // locales
139 /////////////////////
140
141 /// return the locale-dependent decimal point
143 static char get_decimal_point() noexcept
144 {
145 const auto* loc = localeconv();
146 JSON_ASSERT(loc != nullptr);
147 return (loc->decimal_point == nullptr) ? '.' : *(loc->decimal_point);
148 }
149
150 /////////////////////
151 // scan functions
152 /////////////////////
153
154 /*!
155 @brief get codepoint from 4 hex characters following `\u`
156
157 For input "\u c1 c2 c3 c4" the codepoint is:
158 (c1 * 0x1000) + (c2 * 0x0100) + (c3 * 0x0010) + c4
159 = (c1 << 12) + (c2 << 8) + (c3 << 4) + (c4 << 0)
160
161 Furthermore, the possible characters '0'..'9', 'A'..'F', and 'a'..'f'
162 must be converted to the integers 0x0..0x9, 0xA..0xF, 0xA..0xF, resp. The
163 conversion is done by subtracting the offset (0x30, 0x37, and 0x57)
164 between the ASCII value of the character and the desired integer value.
165
166 @return codepoint (0x0000..0xFFFF) or -1 in case of an error (e.g. EOF or
167 non-hex character)
168 */
169 int get_codepoint()
170 {
171 // this function only makes sense after reading `\u`
172 JSON_ASSERT(current == 'u');
173 int codepoint = 0;
174
175 const auto factors = { 12u, 8u, 4u, 0u };
176 for (const auto factor : factors)
177 {
178 get();
179
180 if (current >= '0' && current <= '9')
181 {
182 codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x30u) << factor);
183 }
184 else if (current >= 'A' && current <= 'F')
185 {
186 codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x37u) << factor);
187 }
188 else if (current >= 'a' && current <= 'f')
189 {
190 codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x57u) << factor);
191 }
192 else
193 {
194 return -1;
195 }
196 }
197
198 JSON_ASSERT(0x0000 <= codepoint && codepoint <= 0xFFFF);
199 return codepoint;
200 }
201
202 /*!
203 @brief check if the next byte(s) are inside a given range
204
205 Adds the current byte and, for each passed range, reads a new byte and
206 checks if it is inside the range. If a violation was detected, set up an
207 error message and return false. Otherwise, return true.
208
209 @param[in] ranges list of integers; interpreted as list of pairs of
210 inclusive lower and upper bound, respectively
211
212 @pre The passed list @a ranges must have 2, 4, or 6 elements; that is,
213 1, 2, or 3 pairs. This precondition is enforced by an assertion.
214
215 @return true if and only if no range violation was detected
216 */
217 bool next_byte_in_range(std::initializer_list<char_int_type> ranges)
218 {
219 JSON_ASSERT(ranges.size() == 2 || ranges.size() == 4 || ranges.size() == 6);
220 add(current);
221
222 for (auto range = ranges.begin(); range != ranges.end(); ++range)
223 {
224 get();
225 if (JSON_HEDLEY_LIKELY(*range <= current && current <= *(++range)))
226 {
227 add(current);
228 }
229 else
230 {
231 error_message = "invalid string: ill-formed UTF-8 byte";
232 return false;
233 }
234 }
235
236 return true;
237 }
238
239 /*!
240 @brief scan a string literal
241
242 This function scans a string according to Sect. 7 of RFC 8259. While
243 scanning, bytes are escaped and copied into buffer token_buffer. Then the
244 function returns successfully, token_buffer is *not* null-terminated (as it
245 may contain \0 bytes), and token_buffer.size() is the number of bytes in the
246 string.
247
248 @return token_type::value_string if string could be successfully scanned,
249 token_type::parse_error otherwise
250
251 @note In case of errors, variable error_message contains a textual
252 description.
253 */
254 token_type scan_string()
255 {
256 // reset token_buffer (ignore opening quote)
257 reset();
258
259 // we entered the function by reading an open quote
260 JSON_ASSERT(current == '\"');
261
262 while (true)
263 {
264 // get next character
265 switch (get())
266 {
267 // end of file while parsing string
268 case std::char_traits<char_type>::eof():
269 {
270 error_message = "invalid string: missing closing quote";
271 return token_type::parse_error;
272 }
273
274 // closing quote
275 case '\"':
276 {
277 return token_type::value_string;
278 }
279
280 // escapes
281 case '\\':
282 {
283 switch (get())
284 {
285 // quotation mark
286 case '\"':
287 add('\"');
288 break;
289 // reverse solidus
290 case '\\':
291 add('\\');
292 break;
293 // solidus
294 case '/':
295 add('/');
296 break;
297 // backspace
298 case 'b':
299 add('\b');
300 break;
301 // form feed
302 case 'f':
303 add('\f');
304 break;
305 // line feed
306 case 'n':
307 add('\n');
308 break;
309 // carriage return
310 case 'r':
311 add('\r');
312 break;
313 // tab
314 case 't':
315 add('\t');
316 break;
317
318 // unicode escapes
319 case 'u':
320 {
321 const int codepoint1 = get_codepoint();
322 int codepoint = codepoint1; // start with codepoint1
323
324 if (JSON_HEDLEY_UNLIKELY(codepoint1 == -1))
325 {
326 error_message = "invalid string: '\\u' must be followed by 4 hex digits";
327 return token_type::parse_error;
328 }
329
330 // check if code point is a high surrogate
331 if (0xD800 <= codepoint1 && codepoint1 <= 0xDBFF)
332 {
333 // expect next \uxxxx entry
334 if (JSON_HEDLEY_LIKELY(get() == '\\' && get() == 'u'))
335 {
336 const int codepoint2 = get_codepoint();
337
338 if (JSON_HEDLEY_UNLIKELY(codepoint2 == -1))
339 {
340 error_message = "invalid string: '\\u' must be followed by 4 hex digits";
341 return token_type::parse_error;
342 }
343
344 // check if codepoint2 is a low surrogate
345 if (JSON_HEDLEY_LIKELY(0xDC00 <= codepoint2 && codepoint2 <= 0xDFFF))
346 {
347 // overwrite codepoint
348 codepoint = static_cast<int>(
349 // high surrogate occupies the most significant 22 bits
350 (static_cast<unsigned int>(codepoint1) << 10u)
351 // low surrogate occupies the least significant 15 bits
352 + static_cast<unsigned int>(codepoint2)
353 // there is still the 0xD800, 0xDC00 and 0x10000 noise
354 // in the result, so we have to subtract with:
355 // (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00
356 - 0x35FDC00u);
357 }
358 else
359 {
360 error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF";
361 return token_type::parse_error;
362 }
363 }
364 else
365 {
366 error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF";
367 return token_type::parse_error;
368 }
369 }
370 else
371 {
372 if (JSON_HEDLEY_UNLIKELY(0xDC00 <= codepoint1 && codepoint1 <= 0xDFFF))
373 {
374 error_message = "invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF";
375 return token_type::parse_error;
376 }
377 }
378
379 // result of the above calculation yields a proper codepoint
380 JSON_ASSERT(0x00 <= codepoint && codepoint <= 0x10FFFF);
381
382 // translate codepoint into bytes
383 if (codepoint < 0x80)
384 {
385 // 1-byte characters: 0xxxxxxx (ASCII)
386 add(static_cast<char_int_type>(codepoint));
387 }
388 else if (codepoint <= 0x7FF)
389 {
390 // 2-byte characters: 110xxxxx 10xxxxxx
391 add(static_cast<char_int_type>(0xC0u | (static_cast<unsigned int>(codepoint) >> 6u)));
392 add(static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
393 }
394 else if (codepoint <= 0xFFFF)
395 {
396 // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
397 add(static_cast<char_int_type>(0xE0u | (static_cast<unsigned int>(codepoint) >> 12u)));
398 add(static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 6u) & 0x3Fu)));
399 add(static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
400 }
401 else
402 {
403 // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
404 add(static_cast<char_int_type>(0xF0u | (static_cast<unsigned int>(codepoint) >> 18u)));
405 add(static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 12u) & 0x3Fu)));
406 add(static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 6u) & 0x3Fu)));
407 add(static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
408 }
409
410 break;
411 }
412
413 // other characters after escape
414 default:
415 error_message = "invalid string: forbidden character after backslash";
416 return token_type::parse_error;
417 }
418
419 break;
420 }
421
422 // invalid control characters
423 case 0x00:
424 {
425 error_message = "invalid string: control character U+0000 (NUL) must be escaped to \\u0000";
426 return token_type::parse_error;
427 }
428
429 case 0x01:
430 {
431 error_message = "invalid string: control character U+0001 (SOH) must be escaped to \\u0001";
432 return token_type::parse_error;
433 }
434
435 case 0x02:
436 {
437 error_message = "invalid string: control character U+0002 (STX) must be escaped to \\u0002";
438 return token_type::parse_error;
439 }
440
441 case 0x03:
442 {
443 error_message = "invalid string: control character U+0003 (ETX) must be escaped to \\u0003";
444 return token_type::parse_error;
445 }
446
447 case 0x04:
448 {
449 error_message = "invalid string: control character U+0004 (EOT) must be escaped to \\u0004";
450 return token_type::parse_error;
451 }
452
453 case 0x05:
454 {
455 error_message = "invalid string: control character U+0005 (ENQ) must be escaped to \\u0005";
456 return token_type::parse_error;
457 }
458
459 case 0x06:
460 {
461 error_message = "invalid string: control character U+0006 (ACK) must be escaped to \\u0006";
462 return token_type::parse_error;
463 }
464
465 case 0x07:
466 {
467 error_message = "invalid string: control character U+0007 (BEL) must be escaped to \\u0007";
468 return token_type::parse_error;
469 }
470
471 case 0x08:
472 {
473 error_message = "invalid string: control character U+0008 (BS) must be escaped to \\u0008 or \\b";
474 return token_type::parse_error;
475 }
476
477 case 0x09:
478 {
479 error_message = "invalid string: control character U+0009 (HT) must be escaped to \\u0009 or \\t";
480 return token_type::parse_error;
481 }
482
483 case 0x0A:
484 {
485 error_message = "invalid string: control character U+000A (LF) must be escaped to \\u000A or \\n";
486 return token_type::parse_error;
487 }
488
489 case 0x0B:
490 {
491 error_message = "invalid string: control character U+000B (VT) must be escaped to \\u000B";
492 return token_type::parse_error;
493 }
494
495 case 0x0C:
496 {
497 error_message = "invalid string: control character U+000C (FF) must be escaped to \\u000C or \\f";
498 return token_type::parse_error;
499 }
500
501 case 0x0D:
502 {
503 error_message = "invalid string: control character U+000D (CR) must be escaped to \\u000D or \\r";
504 return token_type::parse_error;
505 }
506
507 case 0x0E:
508 {
509 error_message = "invalid string: control character U+000E (SO) must be escaped to \\u000E";
510 return token_type::parse_error;
511 }
512
513 case 0x0F:
514 {
515 error_message = "invalid string: control character U+000F (SI) must be escaped to \\u000F";
516 return token_type::parse_error;
517 }
518
519 case 0x10:
520 {
521 error_message = "invalid string: control character U+0010 (DLE) must be escaped to \\u0010";
522 return token_type::parse_error;
523 }
524
525 case 0x11:
526 {
527 error_message = "invalid string: control character U+0011 (DC1) must be escaped to \\u0011";
528 return token_type::parse_error;
529 }
530
531 case 0x12:
532 {
533 error_message = "invalid string: control character U+0012 (DC2) must be escaped to \\u0012";
534 return token_type::parse_error;
535 }
536
537 case 0x13:
538 {
539 error_message = "invalid string: control character U+0013 (DC3) must be escaped to \\u0013";
540 return token_type::parse_error;
541 }
542
543 case 0x14:
544 {
545 error_message = "invalid string: control character U+0014 (DC4) must be escaped to \\u0014";
546 return token_type::parse_error;
547 }
548
549 case 0x15:
550 {
551 error_message = "invalid string: control character U+0015 (NAK) must be escaped to \\u0015";
552 return token_type::parse_error;
553 }
554
555 case 0x16:
556 {
557 error_message = "invalid string: control character U+0016 (SYN) must be escaped to \\u0016";
558 return token_type::parse_error;
559 }
560
561 case 0x17:
562 {
563 error_message = "invalid string: control character U+0017 (ETB) must be escaped to \\u0017";
564 return token_type::parse_error;
565 }
566
567 case 0x18:
568 {
569 error_message = "invalid string: control character U+0018 (CAN) must be escaped to \\u0018";
570 return token_type::parse_error;
571 }
572
573 case 0x19:
574 {
575 error_message = "invalid string: control character U+0019 (EM) must be escaped to \\u0019";
576 return token_type::parse_error;
577 }
578
579 case 0x1A:
580 {
581 error_message = "invalid string: control character U+001A (SUB) must be escaped to \\u001A";
582 return token_type::parse_error;
583 }
584
585 case 0x1B:
586 {
587 error_message = "invalid string: control character U+001B (ESC) must be escaped to \\u001B";
588 return token_type::parse_error;
589 }
590
591 case 0x1C:
592 {
593 error_message = "invalid string: control character U+001C (FS) must be escaped to \\u001C";
594 return token_type::parse_error;
595 }
596
597 case 0x1D:
598 {
599 error_message = "invalid string: control character U+001D (GS) must be escaped to \\u001D";
600 return token_type::parse_error;
601 }
602
603 case 0x1E:
604 {
605 error_message = "invalid string: control character U+001E (RS) must be escaped to \\u001E";
606 return token_type::parse_error;
607 }
608
609 case 0x1F:
610 {
611 error_message = "invalid string: control character U+001F (US) must be escaped to \\u001F";
612 return token_type::parse_error;
613 }
614
615 // U+0020..U+007F (except U+0022 (quote) and U+005C (backspace))
616 case 0x20:
617 case 0x21:
618 case 0x23:
619 case 0x24:
620 case 0x25:
621 case 0x26:
622 case 0x27:
623 case 0x28:
624 case 0x29:
625 case 0x2A:
626 case 0x2B:
627 case 0x2C:
628 case 0x2D:
629 case 0x2E:
630 case 0x2F:
631 case 0x30:
632 case 0x31:
633 case 0x32:
634 case 0x33:
635 case 0x34:
636 case 0x35:
637 case 0x36:
638 case 0x37:
639 case 0x38:
640 case 0x39:
641 case 0x3A:
642 case 0x3B:
643 case 0x3C:
644 case 0x3D:
645 case 0x3E:
646 case 0x3F:
647 case 0x40:
648 case 0x41:
649 case 0x42:
650 case 0x43:
651 case 0x44:
652 case 0x45:
653 case 0x46:
654 case 0x47:
655 case 0x48:
656 case 0x49:
657 case 0x4A:
658 case 0x4B:
659 case 0x4C:
660 case 0x4D:
661 case 0x4E:
662 case 0x4F:
663 case 0x50:
664 case 0x51:
665 case 0x52:
666 case 0x53:
667 case 0x54:
668 case 0x55:
669 case 0x56:
670 case 0x57:
671 case 0x58:
672 case 0x59:
673 case 0x5A:
674 case 0x5B:
675 case 0x5D:
676 case 0x5E:
677 case 0x5F:
678 case 0x60:
679 case 0x61:
680 case 0x62:
681 case 0x63:
682 case 0x64:
683 case 0x65:
684 case 0x66:
685 case 0x67:
686 case 0x68:
687 case 0x69:
688 case 0x6A:
689 case 0x6B:
690 case 0x6C:
691 case 0x6D:
692 case 0x6E:
693 case 0x6F:
694 case 0x70:
695 case 0x71:
696 case 0x72:
697 case 0x73:
698 case 0x74:
699 case 0x75:
700 case 0x76:
701 case 0x77:
702 case 0x78:
703 case 0x79:
704 case 0x7A:
705 case 0x7B:
706 case 0x7C:
707 case 0x7D:
708 case 0x7E:
709 case 0x7F:
710 {
711 add(current);
712 break;
713 }
714
715 // U+0080..U+07FF: bytes C2..DF 80..BF
716 case 0xC2:
717 case 0xC3:
718 case 0xC4:
719 case 0xC5:
720 case 0xC6:
721 case 0xC7:
722 case 0xC8:
723 case 0xC9:
724 case 0xCA:
725 case 0xCB:
726 case 0xCC:
727 case 0xCD:
728 case 0xCE:
729 case 0xCF:
730 case 0xD0:
731 case 0xD1:
732 case 0xD2:
733 case 0xD3:
734 case 0xD4:
735 case 0xD5:
736 case 0xD6:
737 case 0xD7:
738 case 0xD8:
739 case 0xD9:
740 case 0xDA:
741 case 0xDB:
742 case 0xDC:
743 case 0xDD:
744 case 0xDE:
745 case 0xDF:
746 {
747 if (JSON_HEDLEY_UNLIKELY(!next_byte_in_range({0x80, 0xBF})))
748 {
749 return token_type::parse_error;
750 }
751 break;
752 }
753
754 // U+0800..U+0FFF: bytes E0 A0..BF 80..BF
755 case 0xE0:
756 {
757 if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0xA0, 0xBF, 0x80, 0xBF}))))
758 {
759 return token_type::parse_error;
760 }
761 break;
762 }
763
764 // U+1000..U+CFFF: bytes E1..EC 80..BF 80..BF
765 // U+E000..U+FFFF: bytes EE..EF 80..BF 80..BF
766 case 0xE1:
767 case 0xE2:
768 case 0xE3:
769 case 0xE4:
770 case 0xE5:
771 case 0xE6:
772 case 0xE7:
773 case 0xE8:
774 case 0xE9:
775 case 0xEA:
776 case 0xEB:
777 case 0xEC:
778 case 0xEE:
779 case 0xEF:
780 {
781 if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF}))))
782 {
783 return token_type::parse_error;
784 }
785 break;
786 }
787
788 // U+D000..U+D7FF: bytes ED 80..9F 80..BF
789 case 0xED:
790 {
791 if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x9F, 0x80, 0xBF}))))
792 {
793 return token_type::parse_error;
794 }
795 break;
796 }
797
798 // U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
799 case 0xF0:
800 {
801 if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF}))))
802 {
803 return token_type::parse_error;
804 }
805 break;
806 }
807
808 // U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
809 case 0xF1:
810 case 0xF2:
811 case 0xF3:
812 {
813 if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF}))))
814 {
815 return token_type::parse_error;
816 }
817 break;
818 }
819
820 // U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
821 case 0xF4:
822 {
823 if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF}))))
824 {
825 return token_type::parse_error;
826 }
827 break;
828 }
829
830 // remaining bytes (80..C1 and F5..FF) are ill-formed
831 default:
832 {
833 error_message = "invalid string: ill-formed UTF-8 byte";
834 return token_type::parse_error;
835 }
836 }
837 }
838 }
839
840 /*!
841 * @brief scan a comment
842 * @return whether comment could be scanned successfully
843 */
844 bool scan_comment()
845 {
846 switch (get())
847 {
848 // single-line comments skip input until a newline or EOF is read
849 case '/':
850 {
851 while (true)
852 {
853 switch (get())
854 {
855 case '\n':
856 case '\r':
857 case std::char_traits<char_type>::eof():
858 case '\0':
859 return true;
860
861 default:
862 break;
863 }
864 }
865 }
866
867 // multi-line comments skip input until */ is read
868 case '*':
869 {
870 while (true)
871 {
872 switch (get())
873 {
874 case std::char_traits<char_type>::eof():
875 case '\0':
876 {
877 error_message = "invalid comment; missing closing '*/'";
878 return false;
879 }
880
881 case '*':
882 {
883 switch (get())
884 {
885 case '/':
886 return true;
887
888 default:
889 {
890 unget();
891 continue;
892 }
893 }
894 }
895
896 default:
897 continue;
898 }
899 }
900 }
901
902 // unexpected character after reading '/'
903 default:
904 {
905 error_message = "invalid comment; expecting '/' or '*' after '/'";
906 return false;
907 }
908 }
909 }
910
/// parse @a str as a float via std::strtof and store the result in @a f;
/// @a endptr is passed through to std::strtof
static void strtof(float& f, const char* str, char** endptr) noexcept
{
    const float parsed = std::strtof(str, endptr);
    f = parsed;
}

/// parse @a str as a double via std::strtod and store the result in @a f;
/// @a endptr is passed through to std::strtod
static void strtof(double& f, const char* str, char** endptr) noexcept
{
    const double parsed = std::strtod(str, endptr);
    f = parsed;
}

/// parse @a str as a long double via std::strtold and store the result in
/// @a f; @a endptr is passed through to std::strtold
static void strtof(long double& f, const char* str, char** endptr) noexcept
{
    const long double parsed = std::strtold(str, endptr);
    f = parsed;
}
928
929 /*!
930 @brief scan a number literal
931
932 This function scans a string according to Sect. 6 of RFC 8259.
933
934 The function is realized with a deterministic finite state machine derived
935 from the grammar described in RFC 8259. Starting in state "init", the
936 input is read and used to determined the next state. Only state "done"
937 accepts the number. State "error" is a trap state to model errors. In the
938 table below, "anything" means any character but the ones listed before.
939
940 state | 0 | 1-9 | e E | + | - | . | anything
941 ---------|----------|----------|----------|---------|---------|----------|-----------
942 init | zero | any1 | [error] | [error] | minus | [error] | [error]
943 minus | zero | any1 | [error] | [error] | [error] | [error] | [error]
944 zero | done | done | exponent | done | done | decimal1 | done
945 any1 | any1 | any1 | exponent | done | done | decimal1 | done
946 decimal1 | decimal2 | decimal2 | [error] | [error] | [error] | [error] | [error]
947 decimal2 | decimal2 | decimal2 | exponent | done | done | done | done
948 exponent | any2 | any2 | [error] | sign | sign | [error] | [error]
949 sign | any2 | any2 | [error] | [error] | [error] | [error] | [error]
950 any2 | any2 | any2 | done | done | done | done | done
951
952 The state machine is realized with one label per state (prefixed with
953 "scan_number_") and `goto` statements between them. The state machine
954 contains cycles, but any cycle can be left when EOF is read. Therefore,
955 the function is guaranteed to terminate.
956
957 During scanning, the read bytes are stored in token_buffer. This string is
958 then converted to a signed integer, an unsigned integer, or a
959 floating-point number.
960
961 @return token_type::value_unsigned, token_type::value_integer, or
962 token_type::value_float if number could be successfully scanned,
963 token_type::parse_error otherwise
964
965 @note The scanner is independent of the current locale. Internally, the
966 locale's decimal point is used instead of `.` to work with the
967 locale-dependent converters.
968 */
969 token_type scan_number() // lgtm [cpp/use-of-goto]
970 {
971 // reset token_buffer to store the number's bytes
972 reset();
973
974 // the type of the parsed number; initially set to unsigned; will be
975 // changed if minus sign, decimal point or exponent is read
976 token_type number_type = token_type::value_unsigned;
977
978 // state (init): we just found out we need to scan a number
979 switch (current)
980 {
981 case '-':
982 {
983 add(current);
984 goto scan_number_minus;
985 }
986
987 case '0':
988 {
989 add(current);
990 goto scan_number_zero;
991 }
992
993 case '1':
994 case '2':
995 case '3':
996 case '4':
997 case '5':
998 case '6':
999 case '7':
1000 case '8':
1001 case '9':
1002 {
1003 add(current);
1004 goto scan_number_any1;
1005 }
1006
1007 // all other characters are rejected outside scan_number()
1008 default: // LCOV_EXCL_LINE
1009 JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
1010 }
1011
1012scan_number_minus:
1013 // state: we just parsed a leading minus sign
1014 number_type = token_type::value_integer;
1015 switch (get())
1016 {
1017 case '0':
1018 {
1019 add(current);
1020 goto scan_number_zero;
1021 }
1022
1023 case '1':
1024 case '2':
1025 case '3':
1026 case '4':
1027 case '5':
1028 case '6':
1029 case '7':
1030 case '8':
1031 case '9':
1032 {
1033 add(current);
1034 goto scan_number_any1;
1035 }
1036
1037 default:
1038 {
1039 error_message = "invalid number; expected digit after '-'";
1040 return token_type::parse_error;
1041 }
1042 }
1043
1044scan_number_zero:
1045 // state: we just parse a zero (maybe with a leading minus sign)
1046 switch (get())
1047 {
1048 case '.':
1049 {
1050 add(decimal_point_char);
1051 goto scan_number_decimal1;
1052 }
1053
1054 case 'e':
1055 case 'E':
1056 {
1057 add(current);
1058 goto scan_number_exponent;
1059 }
1060
1061 default:
1062 goto scan_number_done;
1063 }
1064
1065scan_number_any1:
1066 // state: we just parsed a number 0-9 (maybe with a leading minus sign)
1067 switch (get())
1068 {
1069 case '0':
1070 case '1':
1071 case '2':
1072 case '3':
1073 case '4':
1074 case '5':
1075 case '6':
1076 case '7':
1077 case '8':
1078 case '9':
1079 {
1080 add(current);
1081 goto scan_number_any1;
1082 }
1083
1084 case '.':
1085 {
1086 add(decimal_point_char);
1087 goto scan_number_decimal1;
1088 }
1089
1090 case 'e':
1091 case 'E':
1092 {
1093 add(current);
1094 goto scan_number_exponent;
1095 }
1096
1097 default:
1098 goto scan_number_done;
1099 }
1100
1101scan_number_decimal1:
1102 // state: we just parsed a decimal point
1103 number_type = token_type::value_float;
1104 switch (get())
1105 {
1106 case '0':
1107 case '1':
1108 case '2':
1109 case '3':
1110 case '4':
1111 case '5':
1112 case '6':
1113 case '7':
1114 case '8':
1115 case '9':
1116 {
1117 add(current);
1118 goto scan_number_decimal2;
1119 }
1120
1121 default:
1122 {
1123 error_message = "invalid number; expected digit after '.'";
1124 return token_type::parse_error;
1125 }
1126 }
1127
1128scan_number_decimal2:
1129 // we just parsed at least one number after a decimal point
1130 switch (get())
1131 {
1132 case '0':
1133 case '1':
1134 case '2':
1135 case '3':
1136 case '4':
1137 case '5':
1138 case '6':
1139 case '7':
1140 case '8':
1141 case '9':
1142 {
1143 add(current);
1144 goto scan_number_decimal2;
1145 }
1146
1147 case 'e':
1148 case 'E':
1149 {
1150 add(current);
1151 goto scan_number_exponent;
1152 }
1153
1154 default:
1155 goto scan_number_done;
1156 }
1157
1158scan_number_exponent:
1159 // we just parsed an exponent
1160 number_type = token_type::value_float;
1161 switch (get())
1162 {
1163 case '+':
1164 case '-':
1165 {
1166 add(current);
1167 goto scan_number_sign;
1168 }
1169
1170 case '0':
1171 case '1':
1172 case '2':
1173 case '3':
1174 case '4':
1175 case '5':
1176 case '6':
1177 case '7':
1178 case '8':
1179 case '9':
1180 {
1181 add(current);
1182 goto scan_number_any2;
1183 }
1184
1185 default:
1186 {
1187 error_message =
1188 "invalid number; expected '+', '-', or digit after exponent";
1189 return token_type::parse_error;
1190 }
1191 }
1192
1193scan_number_sign:
1194 // we just parsed an exponent sign
1195 switch (get())
1196 {
1197 case '0':
1198 case '1':
1199 case '2':
1200 case '3':
1201 case '4':
1202 case '5':
1203 case '6':
1204 case '7':
1205 case '8':
1206 case '9':
1207 {
1208 add(current);
1209 goto scan_number_any2;
1210 }
1211
1212 default:
1213 {
1214 error_message = "invalid number; expected digit after exponent sign";
1215 return token_type::parse_error;
1216 }
1217 }
1218
1219scan_number_any2:
1220 // we just parsed a number after the exponent or exponent sign
1221 switch (get())
1222 {
1223 case '0':
1224 case '1':
1225 case '2':
1226 case '3':
1227 case '4':
1228 case '5':
1229 case '6':
1230 case '7':
1231 case '8':
1232 case '9':
1233 {
1234 add(current);
1235 goto scan_number_any2;
1236 }
1237
1238 default:
1239 goto scan_number_done;
1240 }
1241
1242scan_number_done:
1243 // unget the character after the number (we only read it to know that
1244 // we are done scanning a number)
1245 unget();
1246
1247 char* endptr = nullptr; // NOLINT(cppcoreguidelines-pro-type-vararg,hicpp-vararg)
1248 errno = 0;
1249
1250 // try to parse integers first and fall back to floats
1251 if (number_type == token_type::value_unsigned)
1252 {
1253 const auto x = std::strtoull(token_buffer.data(), &endptr, 10);
1254
1255 // we checked the number format before
1256 JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size());
1257
1258 if (errno == 0)
1259 {
1260 value_unsigned = static_cast<number_unsigned_t>(x);
1261 if (value_unsigned == x)
1262 {
1263 return token_type::value_unsigned;
1264 }
1265 }
1266 }
1267 else if (number_type == token_type::value_integer)
1268 {
1269 const auto x = std::strtoll(token_buffer.data(), &endptr, 10);
1270
1271 // we checked the number format before
1272 JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size());
1273
1274 if (errno == 0)
1275 {
1276 value_integer = static_cast<number_integer_t>(x);
1277 if (value_integer == x)
1278 {
1279 return token_type::value_integer;
1280 }
1281 }
1282 }
1283
1284 // this code is reached if we parse a floating-point number or if an
1285 // integer conversion above failed
1286 strtof(value_float, token_buffer.data(), &endptr);
1287
1288 // we checked the number format before
1289 JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size());
1290
1291 return token_type::value_float;
1292 }
1293
1294 /*!
1295 @param[in] literal_text the literal text to expect
1296 @param[in] length the length of the passed literal text
1297 @param[in] return_type the token type to return on success
1298 */
1300 token_type scan_literal(const char_type* literal_text, const std::size_t length,
1301 token_type return_type)
1302 {
1303 JSON_ASSERT(std::char_traits<char_type>::to_char_type(current) == literal_text[0]);
1304 for (std::size_t i = 1; i < length; ++i)
1305 {
1306 if (JSON_HEDLEY_UNLIKELY(std::char_traits<char_type>::to_char_type(get()) != literal_text[i]))
1307 {
1308 error_message = "invalid literal";
1309 return token_type::parse_error;
1310 }
1311 }
1312 return return_type;
1313 }
1314
1315 /////////////////////
1316 // input management
1317 /////////////////////
1318
1319 /// reset token_buffer; current character is beginning of token
1320 void reset() noexcept
1321 {
1322 token_buffer.clear();
1323 token_string.clear();
1324 token_string.push_back(std::char_traits<char_type>::to_char_type(current));
1325 }
1326
1327 /*
1328 @brief get next character from the input
1329
1330 This function provides the interface to the used input adapter. It does
1331 not throw in case the input reached EOF, but returns a
1332 `std::char_traits<char>::eof()` in that case. Stores the scanned characters
1333 for use in error messages.
1334
1335 @return character read from the input
1336 */
1337 char_int_type get()
1338 {
1339 ++position.chars_read_total;
1340 ++position.chars_read_current_line;
1341
1342 if (next_unget)
1343 {
1344 // just reset the next_unget variable and work with current
1345 next_unget = false;
1346 }
1347 else
1348 {
1349 current = ia.get_character();
1350 }
1351
1352 if (JSON_HEDLEY_LIKELY(current != std::char_traits<char_type>::eof()))
1353 {
1354 token_string.push_back(std::char_traits<char_type>::to_char_type(current));
1355 }
1356
1357 if (current == '\n')
1358 {
1359 ++position.lines_read;
1360 position.chars_read_current_line = 0;
1361 }
1362
1363 return current;
1364 }
1365
1366 /*!
1367 @brief unget current character (read it again on next get)
1368
1369 We implement unget by setting variable next_unget to true. The input is not
1370 changed - we just simulate ungetting by modifying chars_read_total,
1371 chars_read_current_line, and token_string. The next call to get() will
1372 behave as if the unget character is read again.
1373 */
1374 void unget()
1375 {
1376 next_unget = true;
1377
1378 --position.chars_read_total;
1379
1380 // in case we "unget" a newline, we have to also decrement the lines_read
1381 if (position.chars_read_current_line == 0)
1382 {
1383 if (position.lines_read > 0)
1384 {
1385 --position.lines_read;
1386 }
1387 }
1388 else
1389 {
1390 --position.chars_read_current_line;
1391 }
1392
1393 if (JSON_HEDLEY_LIKELY(current != std::char_traits<char_type>::eof()))
1394 {
1395 JSON_ASSERT(!token_string.empty());
1396 token_string.pop_back();
1397 }
1398 }
1399
1400 /// add a character to token_buffer
1401 void add(char_int_type c)
1402 {
1403 token_buffer.push_back(static_cast<typename string_t::value_type>(c));
1404 }
1405
1406 public:
1407 /////////////////////
1408 // value getters
1409 /////////////////////
1410
1411 /// return integer value
1412 constexpr number_integer_t get_number_integer() const noexcept
1413 {
1414 return value_integer;
1415 }
1416
1417 /// return unsigned integer value
1418 constexpr number_unsigned_t get_number_unsigned() const noexcept
1419 {
1420 return value_unsigned;
1421 }
1422
1423 /// return floating-point value
1424 constexpr number_float_t get_number_float() const noexcept
1425 {
1426 return value_float;
1427 }
1428
1429 /// return current string value (implicitly resets the token; useful only once)
1430 string_t& get_string()
1431 {
1432 return token_buffer;
1433 }
1434
1435 /////////////////////
1436 // diagnostics
1437 /////////////////////
1438
1439 /// return position of last read token
1440 constexpr position_t get_position() const noexcept
1441 {
1442 return position;
1443 }
1444
1445 /// return the last read token (for errors only). Will never contain EOF
1446 /// (an arbitrary value that is not a valid char value, often -1), because
1447 /// 255 may legitimately occur. May contain NUL, which should be escaped.
1448 std::string get_token_string() const
1449 {
1450 // escape control characters
1451 std::string result;
1452 for (const auto c : token_string)
1453 {
1454 if (static_cast<unsigned char>(c) <= '\x1F')
1455 {
1456 // escape control characters
1457 std::array<char, 9> cs{{}};
1458 static_cast<void>((std::snprintf)(cs.data(), cs.size(), "<U+%.4X>", static_cast<unsigned char>(c))); // NOLINT(cppcoreguidelines-pro-type-vararg,hicpp-vararg)
1459 result += cs.data();
1460 }
1461 else
1462 {
1463 // add character as is
1464 result.push_back(static_cast<std::string::value_type>(c));
1465 }
1466 }
1467
1468 return result;
1469 }
1470
1471 /// return syntax error message
1473 constexpr const char* get_error_message() const noexcept
1474 {
1475 return error_message;
1476 }
1477
1478 /////////////////////
1479 // actual scanner
1480 /////////////////////
1481
1482 /*!
1483 @brief skip the UTF-8 byte order mark
1484 @return true iff there is no BOM or the correct BOM has been skipped
1485 */
1487 {
1488 if (get() == 0xEF)
1489 {
1490 // check if we completely parse the BOM
1491 return get() == 0xBB && get() == 0xBF;
1492 }
1493
1494 // the first character is not the beginning of the BOM; unget it to
1495 // process is later
1496 unget();
1497 return true;
1498 }
1499
1501 {
1502 do
1503 {
1504 get();
1505 }
1506 while (current == ' ' || current == '\t' || current == '\n' || current == '\r');
1507 }
1508
1510 {
1511 // initially, skip the BOM
1512 if (position.chars_read_total == 0 && !skip_bom())
1513 {
1514 error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given";
1515 return token_type::parse_error;
1516 }
1517
1518 // read next character and ignore whitespace
1519 skip_whitespace();
1520
1521 // ignore comments
1522 while (ignore_comments && current == '/')
1523 {
1524 if (!scan_comment())
1525 {
1526 return token_type::parse_error;
1527 }
1528
1529 // skip following whitespace
1530 skip_whitespace();
1531 }
1532
1533 switch (current)
1534 {
1535 // structural characters
1536 case '[':
1537 return token_type::begin_array;
1538 case ']':
1539 return token_type::end_array;
1540 case '{':
1541 return token_type::begin_object;
1542 case '}':
1543 return token_type::end_object;
1544 case ':':
1545 return token_type::name_separator;
1546 case ',':
1547 return token_type::value_separator;
1548
1549 // literals
1550 case 't':
1551 {
1552 std::array<char_type, 4> true_literal = {{static_cast<char_type>('t'), static_cast<char_type>('r'), static_cast<char_type>('u'), static_cast<char_type>('e')}};
1553 return scan_literal(true_literal.data(), true_literal.size(), token_type::literal_true);
1554 }
1555 case 'f':
1556 {
1557 std::array<char_type, 5> false_literal = {{static_cast<char_type>('f'), static_cast<char_type>('a'), static_cast<char_type>('l'), static_cast<char_type>('s'), static_cast<char_type>('e')}};
1558 return scan_literal(false_literal.data(), false_literal.size(), token_type::literal_false);
1559 }
1560 case 'n':
1561 {
1562 std::array<char_type, 4> null_literal = {{static_cast<char_type>('n'), static_cast<char_type>('u'), static_cast<char_type>('l'), static_cast<char_type>('l')}};
1563 return scan_literal(null_literal.data(), null_literal.size(), token_type::literal_null);
1564 }
1565
1566 // string
1567 case '\"':
1568 return scan_string();
1569
1570 // number
1571 case '-':
1572 case '0':
1573 case '1':
1574 case '2':
1575 case '3':
1576 case '4':
1577 case '5':
1578 case '6':
1579 case '7':
1580 case '8':
1581 case '9':
1582 return scan_number();
1583
1584 // end of input (the null byte is needed when parsing from
1585 // string literals)
1586 case '\0':
1587 case std::char_traits<char_type>::eof():
1588 return token_type::end_of_input;
1589
1590 // error
1591 default:
1592 error_message = "invalid literal";
1593 return token_type::parse_error;
1594 }
1595 }
1596
1597 private:
1598 /// input adapter; get() pulls characters from it via get_character()
1599 InputAdapterType ia;
1600
1601 /// whether comments should be ignored (true) or signaled as errors (false)
1602 const bool ignore_comments = false;
1603
1604 /// the character most recently delivered by get(); EOF until the first read
1605 char_int_type current = std::char_traits<char_type>::eof();
1606
1607 /// whether the next get() call should just return current (set by unget())
1608 bool next_unget = false;
1609
1610 /// read position counters (total characters, characters in the current line, lines) maintained by get() and unget()
1611 position_t position {};
1612
1613 /// raw input token string (for error messages); rendered by get_token_string()
1614 std::vector<char_type> token_string {};
1615
1616 /// buffer for variable-length tokens (numbers, strings); exposed by get_string()
1617 string_t token_buffer {};
1618
1619 /// a description of occurred lexer errors; exposed by get_error_message()
1620 const char* error_message = "";
1621
1622 // number values of the last scanned number; exposed by the get_number_*() getters
1623 number_integer_t value_integer = 0;
1624 number_unsigned_t value_unsigned = 0;
1625 number_float_t value_float = 0;
1626
1627 /// the decimal point character (initialized to '.')
1628 const char_int_type decimal_point_char = '.';
1629};
1630
1631} // namespace detail
#define WPI_JSON_NAMESPACE_END
Definition: abi_macros.h:59
#define WPI_JSON_NAMESPACE_BEGIN
Definition: abi_macros.h:53
Definition: lexer.h:35
JSON_HEDLEY_RETURNS_NON_NULL static JSON_HEDLEY_CONST const char * token_type_name(const token_type t) noexcept
return name of values of type token_type (only used for errors)
Definition: lexer.h:62
token_type
token types for the parser
Definition: lexer.h:39
@ value_float
a floating-point number – use get_number_float() for actual value
@ begin_array
the character for array begin [
@ value_string
a string – use get_string() for actual value
@ end_array
the character for array end ]
@ uninitialized
indicating the scanner is uninitialized
@ parse_error
indicating a parse error
@ value_integer
a signed integer – use get_number_integer() for actual value
@ value_separator
the value separator ,
@ end_object
the character for object end }
@ literal_true
the true literal
@ begin_object
the character for object begin {
@ value_unsigned
an unsigned integer – use get_number_unsigned() for actual value
@ literal_null
the null literal
@ end_of_input
indicating the end of the input buffer
@ name_separator
the name separator :
@ literal_or_value
a literal or the begin of a value (only for diagnostics)
@ literal_false
the false literal
lexical analysis
Definition: lexer.h:112
bool skip_bom()
skip the UTF-8 byte order mark
Definition: lexer.h:1486
void skip_whitespace()
Definition: lexer.h:1500
lexer(InputAdapterType &&adapter, bool ignore_comments_=false) noexcept
Definition: lexer.h:123
JSON_HEDLEY_RETURNS_NON_NULL constexpr const char * get_error_message() const noexcept
return syntax error message
Definition: lexer.h:1473
std::string get_token_string() const
return the last read token (for errors only).
Definition: lexer.h:1448
constexpr number_integer_t get_number_integer() const noexcept
return integer value
Definition: lexer.h:1412
constexpr position_t get_position() const noexcept
return position of last read token
Definition: lexer.h:1440
token_type scan()
Definition: lexer.h:1509
constexpr number_unsigned_t get_number_unsigned() const noexcept
return unsigned integer value
Definition: lexer.h:1418
typename lexer_base< BasicJsonType >::token_type token_type
Definition: lexer.h:121
lexer(lexer &&)=default
~lexer()=default
lexer & operator=(lexer &&)=default
lexer(const lexer &)=delete
lexer & operator=(lexer &)=delete
string_t & get_string()
return current string value (implicitly resets the token; useful only once)
Definition: lexer.h:1430
constexpr number_float_t get_number_float() const noexcept
return floating-point value
Definition: lexer.h:1424
exception indicating a parse error
Definition: exceptions.h:135
#define JSON_HEDLEY_CONST
Definition: hedley.h:1500
#define JSON_HEDLEY_LIKELY(expr)
Definition: hedley.h:1395
#define JSON_HEDLEY_NON_NULL(...)
Definition: hedley.h:1288
#define JSON_HEDLEY_RETURNS_NON_NULL
Definition: hedley.h:1729
#define JSON_HEDLEY_UNLIKELY(expr)
Definition: hedley.h:1396
#define JSON_HEDLEY_PURE
Definition: hedley.h:1469
#define JSON_ASSERT(x)
Definition: macro_scope.h:192
CameraServer (cscore) namespace.
Definition: cscore_oo.inc:16
detail namespace with internal helper functions
Definition: xchar.h:20
auto get(const wpi::detail::iteration_proxy_value< IteratorType > &i) -> decltype(i.key())
Definition: iteration_proxy.h:193
Definition: array.h:89
static constexpr const velocity::meters_per_second_t c(299792458.0)
Speed of light in vacuum.
struct to capture the start position of the current token
Definition: position_t.h:21