You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

2388 lines
87 KiB

  1. from __future__ import annotations
  2. # Scanner produces tokens of the following types:
  3. # STREAM-START
  4. # STREAM-END
  5. # DIRECTIVE(name, value)
  6. # DOCUMENT-START
  7. # DOCUMENT-END
  8. # BLOCK-SEQUENCE-START
  9. # BLOCK-MAPPING-START
  10. # BLOCK-END
  11. # FLOW-SEQUENCE-START
  12. # FLOW-MAPPING-START
  13. # FLOW-SEQUENCE-END
  14. # FLOW-MAPPING-END
  15. # BLOCK-ENTRY
  16. # FLOW-ENTRY
  17. # KEY
  18. # VALUE
  19. # ALIAS(value)
  20. # ANCHOR(value)
  21. # TAG(value)
  22. # SCALAR(value, plain, style)
  23. #
  24. # RoundTripScanner
  25. # COMMENT(value)
  26. #
  27. # Read comments in the Scanner code for more details.
  28. #
  29. from ruamel.yaml.error import MarkedYAMLError, CommentMark # NOQA
  30. from ruamel.yaml.tokens import * # NOQA
  31. from ruamel.yaml.docinfo import Version, Tag # NOQA
  32. from ruamel.yaml.compat import check_anchorname_char, _debug, nprint, nprintf # NOQA
  33. if False: # MYPY
  34. from typing import Any, Dict, Optional, List, Union, Text, Tuple # NOQA
  35. __all__ = ['Scanner', 'RoundTripScanner', 'ScannerError']
  36. _THE_END = '\n\0\r\x85\u2028\u2029'
  37. _THE_END_SPACE_TAB = ' \n\0\t\r\x85\u2028\u2029'
  38. _SPACE_TAB = ' \t'
if _debug != 0:
    # Debug-only helper: defined solely when ruamel's _debug flag is set, so
    # stray xprintf() calls left in the code are a NameError in production.
    def xprintf(*args: Any, **kw: Any) -> Any:
        return nprintf(*args, **kw)
  42. class ScannerError(MarkedYAMLError):
  43. pass
class SimpleKey:
    # See below simple keys treatment.
    # Records where a potential simple key started, so the scanner can later
    # insert a KEY token at that position once the ':' indicator is found.
    def __init__(
        self, token_number: Any, required: Any, index: int, line: int, column: int, mark: Any,
    ) -> None:
        # Position of the key in the emitted token stream.
        self.token_number = token_number
        # True when a ':' MUST follow (block context) — otherwise it is an error.
        self.required = required
        # Absolute character index in the input stream.
        self.index = index
        self.line = line
        self.column = column
        # Mark kept for error reporting.
        self.mark = mark
  55. class Scanner:
    def __init__(self, loader: Any = None) -> None:
        """Initialize the scanner."""
        # It is assumed that Scanner and Reader will have a common descendant.
        # Reader does the dirty work of checking for BOM and converting the
        # input data to Unicode. It also adds NUL to the end.
        #
        # Reader supports the following methods
        #   self.peek(i=0)    # peek the next i-th character
        #   self.prefix(l=1)  # peek the next l characters
        #   self.forward(l=1) # read the next l characters and move the pointer
        self.loader = loader
        # Register ourselves on the loader unless it already has a scanner.
        if self.loader is not None and getattr(self.loader, '_scanner', None) is None:
            self.loader._scanner = self
        self.reset_scanner()
        self.first_time = False
    @property
    def flow_level(self) -> int:
        # Depth of nesting in flow context; 0 means block context.
        return len(self.flow_context)
    def reset_scanner(self) -> None:
        """Reset all scanning state and queue the STREAM-START token."""
        # Had we reached the end of the stream?
        self.done = False
        # flow_context is an expanding/shrinking list consisting of '{' and '['
        # for each unclosed flow context. An empty list means block context.
        self.flow_context: List[Text] = []
        # List of processed tokens that are not yet emitted.
        self.tokens: List[Any] = []
        # Add the STREAM-START token.
        self.fetch_stream_start()
        # Number of tokens that were emitted through the `get_token` method.
        self.tokens_taken = 0
        # The current indentation level.
        self.indent = -1
        # Past indentation levels.
        self.indents: List[int] = []
        # Variables related to simple keys treatment.
        # A simple key is a key that is not denoted by the '?' indicator, e.g.
        #   ---
        #   block simple key: value
        #   ? not a simple key:
        #   : { flow simple key: value }
        # We emit the KEY token before all keys, so when we find a potential
        # simple key, we try to locate the corresponding ':' indicator.
        # Simple keys should be limited to a single line and 1024 characters.
        # Can a simple key start at the current position? A simple key may
        # start:
        # - at the beginning of the line, not counting indentation spaces
        #   (in block context),
        # - after '{', '[', ',' (in the flow context),
        # - after '?', ':', '-' (in the block context).
        # In the block context, this flag also signifies if a block collection
        # may start at the current position.
        self.allow_simple_key = True
        # Keep track of possible simple keys. This is a dictionary. The key
        # is `flow_level`; there can be no more than one possible simple key
        # for each level. The value is a SimpleKey record:
        #   (token_number, required, index, line, column, mark)
        # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow),
        # '[', or '{' tokens.
        self.possible_simple_keys: Dict[Any, Any] = {}
        # Version from a %YAML directive, if any.
        self.yaml_version: Any = None
        # (handle, prefix) pairs from %TAG directives.
        self.tag_directives: List[Tuple[Any, Any]] = []
    @property
    def reader(self) -> Any:
        # Lazily resolve and cache the reader from the loader (EAFP: the
        # cached attribute does not exist until first access).
        try:
            return self._scanner_reader  # type: ignore
        except AttributeError:
            # New-style loaders (with a `typ`) expose the reader directly;
            # legacy composited loaders store it as `_reader`.
            if hasattr(self.loader, 'typ'):
                self._scanner_reader = self.loader.reader
            else:
                self._scanner_reader = self.loader._reader
            return self._scanner_reader
  128. @property
  129. def scanner_processing_version(self) -> Any: # prefix until un-composited
  130. if hasattr(self.loader, 'typ'):
  131. return self.loader.resolver.processing_version
  132. return self.loader.processing_version
  133. # Public methods.
  134. def check_token(self, *choices: Any) -> bool:
  135. # Check if the next token is one of the given types.
  136. while self.need_more_tokens():
  137. self.fetch_more_tokens()
  138. if len(self.tokens) > 0:
  139. if not choices:
  140. return True
  141. for choice in choices:
  142. if isinstance(self.tokens[0], choice):
  143. return True
  144. return False
  145. def peek_token(self) -> Any:
  146. # Return the next token, but do not delete if from the queue.
  147. while self.need_more_tokens():
  148. self.fetch_more_tokens()
  149. if len(self.tokens) > 0:
  150. return self.tokens[0]
  151. def get_token(self) -> Any:
  152. # Return the next token.
  153. while self.need_more_tokens():
  154. self.fetch_more_tokens()
  155. if len(self.tokens) > 0:
  156. self.tokens_taken += 1
  157. return self.tokens.pop(0)
  158. # Private methods.
    def need_more_tokens(self) -> bool:
        """Return True when another fetch is required before a token can be
        safely handed out."""
        if self.done:
            return False
        if len(self.tokens) == 0:
            return True
        # The current token may be a potential simple key, so we
        # need to look further: first drop stale candidates, then check
        # whether the head of the queue is still a simple-key candidate.
        self.stale_possible_simple_keys()
        if self.next_possible_simple_key() == self.tokens_taken:
            return True
        return False
    def fetch_comment(self, comment: Any) -> None:
        # Only the RoundTripScanner handles comments; for the base scanner
        # scan_to_next_token() always returns None, so this is never reached.
        raise NotImplementedError
    def fetch_more_tokens(self) -> Any:
        """Dispatch on the next significant character and fetch the
        corresponding token(s) onto the queue."""
        # Eat whitespaces and comments until we reach the next token.
        comment = self.scan_to_next_token()
        if comment is not None:  # never happens for base scanner
            return self.fetch_comment(comment)
        # Remove obsolete possible simple keys.
        self.stale_possible_simple_keys()
        # Compare the current indentation and column. It may add some tokens
        # and decrease the current indentation level.
        self.unwind_indent(self.reader.column)
        # Peek the next character.
        ch = self.reader.peek()
        # Is it the end of stream?
        if ch == '\0':
            return self.fetch_stream_end()
        # Is it a directive?
        if ch == '%' and self.check_directive():
            return self.fetch_directive()
        # Is it the document start?
        if ch == '-' and self.check_document_start():
            return self.fetch_document_start()
        # Is it the document end?
        if ch == '.' and self.check_document_end():
            return self.fetch_document_end()
        # TODO: support for BOM within a stream.
        # if ch == '\uFEFF':
        #     return self.fetch_bom()    <-- issue BOMToken
        # Note: the order of the following checks is NOT significant.
        # Is it the flow sequence start indicator?
        if ch == '[':
            return self.fetch_flow_sequence_start()
        # Is it the flow mapping start indicator?
        if ch == '{':
            return self.fetch_flow_mapping_start()
        # Is it the flow sequence end indicator?
        if ch == ']':
            return self.fetch_flow_sequence_end()
        # Is it the flow mapping end indicator?
        if ch == '}':
            return self.fetch_flow_mapping_end()
        # Is it the flow entry indicator?
        if ch == ',':
            return self.fetch_flow_entry()
        # Is it the block entry indicator?
        if ch == '-' and self.check_block_entry():
            return self.fetch_block_entry()
        # Is it the key indicator?
        if ch == '?' and self.check_key():
            return self.fetch_key()
        # Is it the value indicator?
        if ch == ':' and self.check_value():
            return self.fetch_value()
        # Is it an alias?
        if ch == '*':
            return self.fetch_alias()
        # Is it an anchor?
        if ch == '&':
            return self.fetch_anchor()
        # Is it a tag?
        if ch == '!':
            return self.fetch_tag()
        # Is it a literal scalar? (block context only)
        if ch == '|' and not self.flow_level:
            return self.fetch_literal()
        # Is it a folded scalar? (block context only)
        if ch == '>' and not self.flow_level:
            return self.fetch_folded()
        # Is it a single quoted scalar?
        if ch == "'":
            return self.fetch_single()
        # Is it a double quoted scalar?
        if ch == '"':
            return self.fetch_double()
        # It must be a plain scalar then.
        if self.check_plain():
            return self.fetch_plain()
        # No? It's an error. Let's produce a nice error message.
        raise ScannerError(
            'while scanning for the next token',
            None,
            f'found character {ch!r} that cannot start any token',
            self.reader.get_mark(),
        )
  255. # Simple keys treatment.
  256. def next_possible_simple_key(self) -> Any:
  257. # Return the number of the nearest possible simple key. Actually we
  258. # don't need to loop through the whole dictionary. We may replace it
  259. # with the following code:
  260. # if not self.possible_simple_keys:
  261. # return None
  262. # return self.possible_simple_keys[
  263. # min(self.possible_simple_keys.keys())].token_number
  264. min_token_number = None
  265. for level in self.possible_simple_keys:
  266. key = self.possible_simple_keys[level]
  267. if min_token_number is None or key.token_number < min_token_number:
  268. min_token_number = key.token_number
  269. return min_token_number
    def stale_possible_simple_keys(self) -> None:
        """Drop saved simple-key candidates that can no longer be keys.

        According to the YAML specification, simple keys
        - should be limited to a single line,
        - should be no longer than 1024 characters.
        Disabling this procedure would allow simple keys of any length and
        height (may cause problems if indentation is broken though).
        Raises ScannerError if a *required* key candidate went stale.
        """
        for level in list(self.possible_simple_keys):
            key = self.possible_simple_keys[level]
            if key.line != self.reader.line or self.reader.index - key.index > 1024:
                if key.required:
                    raise ScannerError(
                        'while scanning a simple key',
                        key.mark,
                        "could not find expected ':'",
                        self.reader.get_mark(),
                    )
                del self.possible_simple_keys[level]
    def save_possible_simple_key(self) -> None:
        """Record the current position as a potential simple-key start.

        Called for ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.
        """
        # A simple key is required at the current position when we are in
        # block context right at the current indentation column.
        required = not self.flow_level and self.indent == self.reader.column
        # The next token might be a simple key. Let's save its number and
        # position.
        if self.allow_simple_key:
            self.remove_possible_simple_key()
            token_number = self.tokens_taken + len(self.tokens)
            key = SimpleKey(
                token_number,
                required,
                self.reader.index,
                self.reader.line,
                self.reader.column,
                self.reader.get_mark(),
            )
            self.possible_simple_keys[self.flow_level] = key
    def remove_possible_simple_key(self) -> None:
        """Discard the saved possible key position at the current flow level.

        Raises ScannerError when discarding a *required* key, since that
        means the mandatory ':' was never found.
        """
        if self.flow_level in self.possible_simple_keys:
            key = self.possible_simple_keys[self.flow_level]
            if key.required:
                raise ScannerError(
                    'while scanning a simple key',
                    key.mark,
                    "could not find expected ':'",
                    self.reader.get_mark(),
                )
            del self.possible_simple_keys[self.flow_level]
  320. # Indentation functions.
    def unwind_indent(self, column: Any) -> None:
        """Pop indentation levels above *column*, emitting BLOCK-END tokens.

        In flow context, tokens should respect indentation.
        Actually the condition should be `self.indent >= column` according to
        the spec. But this condition will prohibit intuitively correct
        constructions such as
            key : {
            }
        so in the flow context indentation is ignored: we make the scanner
        less restrictive than the specification requires.
        """
        if bool(self.flow_level):
            return
        # In block context, we may need to issue the BLOCK-END tokens.
        while self.indent > column:
            mark = self.reader.get_mark()
            self.indent = self.indents.pop()
            self.tokens.append(BlockEndToken(mark, mark))
  342. def add_indent(self, column: int) -> bool:
  343. # Check if we need to increase indentation.
  344. if self.indent < column:
  345. self.indents.append(self.indent)
  346. self.indent = column
  347. return True
  348. return False
  349. # Fetchers.
    def fetch_stream_start(self) -> None:
        # We always add STREAM-START as the first token and STREAM-END as the
        # last token.
        mark = self.reader.get_mark()
        # Add STREAM-START (carrying the detected input encoding).
        self.tokens.append(StreamStartToken(mark, mark, encoding=self.reader.encoding))
    def fetch_stream_end(self) -> None:
        # Set the current indentation to -1.
        self.unwind_indent(-1)
        # Reset simple keys.
        self.remove_possible_simple_key()
        self.allow_simple_key = False
        self.possible_simple_keys = {}
        # Read the token.
        mark = self.reader.get_mark()
        # Add STREAM-END.
        self.tokens.append(StreamEndToken(mark, mark))
        # The stream is finished.
        self.done = True
    def fetch_directive(self) -> None:
        # Set the current indentation to -1.
        self.unwind_indent(-1)
        # Reset simple keys.
        self.remove_possible_simple_key()
        self.allow_simple_key = False
        # Scan and add DIRECTIVE.
        self.tokens.append(self.scan_directive())
    def fetch_document_start(self) -> None:
        # '---' marker.
        self.fetch_document_indicator(DocumentStartToken)
    def fetch_document_end(self) -> None:
        # '...' marker.
        self.fetch_document_indicator(DocumentEndToken)
    def fetch_document_indicator(self, TokenClass: Any) -> None:
        # Set the current indentation to -1.
        self.unwind_indent(-1)
        # Reset simple keys. Note that there could not be a block collection
        # after '---'.
        self.remove_possible_simple_key()
        self.allow_simple_key = False
        # Add DOCUMENT-START or DOCUMENT-END, consuming the 3-char marker.
        start_mark = self.reader.get_mark()
        self.reader.forward(3)
        end_mark = self.reader.get_mark()
        self.tokens.append(TokenClass(start_mark, end_mark))
    def fetch_flow_sequence_start(self) -> None:
        # '[' opens a flow sequence context.
        self.fetch_flow_collection_start(FlowSequenceStartToken, to_push='[')
    def fetch_flow_mapping_start(self) -> None:
        # '{' opens a flow mapping context.
        self.fetch_flow_collection_start(FlowMappingStartToken, to_push='{')
    def fetch_flow_collection_start(self, TokenClass: Any, to_push: Text) -> None:
        # '[' and '{' may start a simple key.
        self.save_possible_simple_key()
        # Increase the flow level (push the opening indicator).
        self.flow_context.append(to_push)
        # Simple keys are allowed after '[' and '{'.
        self.allow_simple_key = True
        # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
        start_mark = self.reader.get_mark()
        self.reader.forward()
        end_mark = self.reader.get_mark()
        self.tokens.append(TokenClass(start_mark, end_mark))
    def fetch_flow_sequence_end(self) -> None:
        # ']' closes a flow sequence context.
        self.fetch_flow_collection_end(FlowSequenceEndToken)
    def fetch_flow_mapping_end(self) -> None:
        # '}' closes a flow mapping context.
        self.fetch_flow_collection_end(FlowMappingEndToken)
    def fetch_flow_collection_end(self, TokenClass: Any) -> None:
        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()
        # Decrease the flow level.
        try:
            popped = self.flow_context.pop()  # NOQA
        except IndexError:
            # We must not be in a list or object.
            # Defer error handling to the parser.
            pass
        # No simple keys after ']' or '}'.
        self.allow_simple_key = False
        # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
        start_mark = self.reader.get_mark()
        self.reader.forward()
        end_mark = self.reader.get_mark()
        self.tokens.append(TokenClass(start_mark, end_mark))
    def fetch_flow_entry(self) -> None:
        # Simple keys are allowed after ','.
        self.allow_simple_key = True
        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()
        # Add FLOW-ENTRY.
        start_mark = self.reader.get_mark()
        self.reader.forward()
        end_mark = self.reader.get_mark()
        self.tokens.append(FlowEntryToken(start_mark, end_mark))
    def fetch_block_entry(self) -> None:
        # Block context needs additional checks.
        if not self.flow_level:
            # Are we allowed to start a new entry?
            if not self.allow_simple_key:
                raise ScannerError(
                    None,
                    None,
                    'sequence entries are not allowed here',
                    self.reader.get_mark(),
                )
            # We may need to add BLOCK-SEQUENCE-START.
            if self.add_indent(self.reader.column):
                mark = self.reader.get_mark()
                self.tokens.append(BlockSequenceStartToken(mark, mark))
        # It's an error for the block entry to occur in the flow context,
        # but we let the parser detect this.
        else:
            pass
        # Simple keys are allowed after '-'.
        self.allow_simple_key = True
        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()
        # Add BLOCK-ENTRY.
        start_mark = self.reader.get_mark()
        self.reader.forward()
        end_mark = self.reader.get_mark()
        self.tokens.append(BlockEntryToken(start_mark, end_mark))
    def fetch_key(self) -> None:
        # Block context needs additional checks.
        if not self.flow_level:
            # Are we allowed to start a key (not necessarily a simple one)?
            if not self.allow_simple_key:
                raise ScannerError(
                    None, None, 'mapping keys are not allowed here', self.reader.get_mark(),
                )
            # We may need to add BLOCK-MAPPING-START.
            if self.add_indent(self.reader.column):
                mark = self.reader.get_mark()
                self.tokens.append(BlockMappingStartToken(mark, mark))
        # Simple keys are allowed after '?' in the block context.
        self.allow_simple_key = not self.flow_level
        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()
        # Add KEY.
        start_mark = self.reader.get_mark()
        self.reader.forward()
        end_mark = self.reader.get_mark()
        self.tokens.append(KeyToken(start_mark, end_mark))
    def fetch_value(self) -> None:
        """Handle ':' — retroactively emit KEY for a saved simple key, or
        start a complex value, then emit VALUE."""
        # Do we determine a simple key?
        if self.flow_level in self.possible_simple_keys:
            # Add KEY at the saved position (insert before already-queued
            # tokens that followed the key scalar).
            key = self.possible_simple_keys[self.flow_level]
            del self.possible_simple_keys[self.flow_level]
            self.tokens.insert(
                key.token_number - self.tokens_taken, KeyToken(key.mark, key.mark),
            )
            # If this key starts a new block mapping, we need to add
            # BLOCK-MAPPING-START.
            if not self.flow_level:
                if self.add_indent(key.column):
                    self.tokens.insert(
                        key.token_number - self.tokens_taken,
                        BlockMappingStartToken(key.mark, key.mark),
                    )
            # There cannot be two simple keys one after another.
            self.allow_simple_key = False
        # It must be a part of a complex key.
        else:
            # Block context needs additional checks.
            # (Do we really need them? They will be caught by the parser
            # anyway.)
            if not self.flow_level:
                # We are allowed to start a complex value if and only if
                # we can start a simple key.
                if not self.allow_simple_key:
                    raise ScannerError(
                        None,
                        None,
                        'mapping values are not allowed here',
                        self.reader.get_mark(),
                    )
            # If this value starts a new block mapping, we need to add
            # BLOCK-MAPPING-START. It will be detected as an error later by
            # the parser.
            if not self.flow_level:
                if self.add_indent(self.reader.column):
                    mark = self.reader.get_mark()
                    self.tokens.append(BlockMappingStartToken(mark, mark))
            # Simple keys are allowed after ':' in the block context.
            self.allow_simple_key = not self.flow_level
            # Reset possible simple key on the current level.
            self.remove_possible_simple_key()
        # Add VALUE.
        start_mark = self.reader.get_mark()
        self.reader.forward()
        end_mark = self.reader.get_mark()
        self.tokens.append(ValueToken(start_mark, end_mark))
    def fetch_alias(self) -> None:
        # ALIAS could be a simple key.
        self.save_possible_simple_key()
        # No simple keys after ALIAS.
        self.allow_simple_key = False
        # Scan and add ALIAS.
        self.tokens.append(self.scan_anchor(AliasToken))
    def fetch_anchor(self) -> None:
        # ANCHOR could start a simple key.
        self.save_possible_simple_key()
        # No simple keys after ANCHOR.
        self.allow_simple_key = False
        # Scan and add ANCHOR.
        self.tokens.append(self.scan_anchor(AnchorToken))
    def fetch_tag(self) -> None:
        # TAG could start a simple key.
        self.save_possible_simple_key()
        # No simple keys after TAG.
        self.allow_simple_key = False
        # Scan and add TAG.
        self.tokens.append(self.scan_tag())
    def fetch_literal(self) -> None:
        # '|' literal block scalar.
        self.fetch_block_scalar(style='|')
    def fetch_folded(self) -> None:
        # '>' folded block scalar.
        self.fetch_block_scalar(style='>')
    def fetch_block_scalar(self, style: Any) -> None:
        # A simple key may follow a block scalar.
        self.allow_simple_key = True
        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()
        # Scan and add SCALAR.
        self.tokens.append(self.scan_block_scalar(style))
    def fetch_single(self) -> None:
        # Single-quoted flow scalar.
        self.fetch_flow_scalar(style="'")
    def fetch_double(self) -> None:
        # Double-quoted flow scalar.
        self.fetch_flow_scalar(style='"')
    def fetch_flow_scalar(self, style: Any) -> None:
        # A flow scalar could be a simple key.
        self.save_possible_simple_key()
        # No simple keys after flow scalars.
        self.allow_simple_key = False
        # Scan and add SCALAR.
        self.tokens.append(self.scan_flow_scalar(style))
    def fetch_plain(self) -> None:
        # A plain scalar could be a simple key.
        self.save_possible_simple_key()
        # No simple keys after plain scalars. But note that `scan_plain` will
        # change this flag if the scan is finished at the beginning of the
        # line.
        self.allow_simple_key = False
        # Scan and add SCALAR. May change `allow_simple_key`.
        self.tokens.append(self.scan_plain())
  592. # Checkers.
  593. def check_directive(self) -> Any:
  594. # DIRECTIVE: ^ '%' ...
  595. # The '%' indicator is already checked.
  596. if self.reader.column == 0:
  597. return True
  598. return None
    def check_document_start(self) -> Any:
        # DOCUMENT-START: ^ '---' (' '|'\n')
        # Marker must be at column 0 and be followed by space/break/EOF.
        if self.reader.column == 0:
            if self.reader.prefix(3) == '---' and self.reader.peek(3) in _THE_END_SPACE_TAB:
                return True
        return None
    def check_document_end(self) -> Any:
        # DOCUMENT-END: ^ '...' (' '|'\n')
        # Marker must be at column 0 and be followed by space/break/EOF.
        if self.reader.column == 0:
            if self.reader.prefix(3) == '...' and self.reader.peek(3) in _THE_END_SPACE_TAB:
                return True
        return None
    def check_block_entry(self) -> Any:
        # BLOCK-ENTRY: '-' (' '|'\n')
        return self.reader.peek(1) in _THE_END_SPACE_TAB
    def check_key(self) -> Any:
        # KEY(flow context): '?' by itself is enough.
        if bool(self.flow_level):
            return True
        # KEY(block context): '?' (' '|'\n')
        return self.reader.peek(1) in _THE_END_SPACE_TAB
    def check_value(self) -> Any:
        # VALUE(flow context): ':'
        if self.scanner_processing_version == (1, 1):
            # YAML 1.1: any ':' in flow context starts a value.
            if bool(self.flow_level):
                return True
        else:
            # YAML 1.2 is stricter about ':' adjacency inside flow
            # collections.
            if bool(self.flow_level):
                if self.flow_context[-1] == '[':
                    if self.reader.peek(1) not in _THE_END_SPACE_TAB:
                        return False
                elif self.tokens and isinstance(self.tokens[-1], ValueToken):
                    # mapping flow context scanning a value token
                    if self.reader.peek(1) not in _THE_END_SPACE_TAB:
                        return False
                return True
        # VALUE(block context): ':' (' '|'\n')
        return self.reader.peek(1) in _THE_END_SPACE_TAB
    def check_plain(self) -> Any:
        # A plain scalar may start with any non-space character except:
        #   '-', '?', ':', ',', '[', ']', '{', '}',
        #   '#', '&', '*', '!', '|', '>', '\'', '\"',
        #   '%', '@', '`'.
        #
        # It may also start with
        #   '-', '?', ':'
        # if it is followed by a non-space character.
        #
        # Note that we limit the last rule to the block context (except the
        # '-' character) because we want the flow context to be space
        # independent.
        srp = self.reader.peek
        ch = srp()
        if self.scanner_processing_version == (1, 1):
            return ch not in '\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'"%@`' or (
                srp(1) not in _THE_END_SPACE_TAB
                and (ch == '-' or (not self.flow_level and ch in '?:'))
            )
        # YAML 1.2
        if ch not in '\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'"%@`':
            # ################### ^ ???
            return True
        ch1 = srp(1)
        # '-' followed by non-space starts a plain scalar in any context.
        if ch == '-' and ch1 not in _THE_END_SPACE_TAB:
            return True
        # ':' adjacent to a non-space char is allowed in flow context (1.2).
        if ch == ':' and bool(self.flow_level) and ch1 not in _SPACE_TAB:
            return True
        return srp(1) not in _THE_END_SPACE_TAB and (
            ch == '-' or (not self.flow_level and ch in '?:')
        )
  669. # Scanners.
    def scan_to_next_token(self) -> Any:
        """Skip spaces, line breaks and comments up to the next token.

        If we find a line break in the block context, the flag
        `allow_simple_key` is set. The byte order mark is stripped if it's
        the first character in the stream. We do not yet support a BOM inside
        the stream as the specification requires; any such mark is considered
        part of the document.

        TODO: We need to make tab handling rules more sane. A good rule is:
        tabs cannot precede tokens BLOCK-SEQUENCE-START, BLOCK-MAPPING-START,
        BLOCK-END, KEY(block), VALUE(block), BLOCK-ENTRY. So the checking
        code would be `if <TAB>: self.allow_simple_keys = False`, and we
        would also need to check `allow_simple_keys == True` in
        `unwind_indent` before issuing BLOCK-END. Scanners for block, flow,
        and plain scalars would need to be modified.
        """
        srp = self.reader.peek
        srf = self.reader.forward
        # Strip a leading BOM.
        if self.reader.index == 0 and srp() == '\uFEFF':
            srf()
        found = False
        _the_end = _THE_END
        # Tabs are only insignificant whitespace inside flow context.
        white_space = ' \t' if self.flow_level > 0 else ' '
        while not found:
            while srp() in white_space:
                srf()
            if srp() == '#':
                # Skip a comment to end of line.
                while srp() not in _the_end:
                    srf()
            if self.scan_line_break():
                if not self.flow_level:
                    self.allow_simple_key = True
            else:
                found = True
        # The base scanner never returns a comment.
        return None
    def scan_directive(self) -> Any:
        """Scan a '%' directive line into a DirectiveToken.

        See the specification for details. Unknown directives are skipped to
        end of line with value None.
        """
        srp = self.reader.peek
        srf = self.reader.forward
        start_mark = self.reader.get_mark()
        srf()
        name = self.scan_directive_name(start_mark)
        value = None
        if name == 'YAML':
            value = self.scan_yaml_directive_value(start_mark)
            end_mark = self.reader.get_mark()
        elif name == 'TAG':
            value = self.scan_tag_directive_value(start_mark)
            end_mark = self.reader.get_mark()
        else:
            end_mark = self.reader.get_mark()
            # Skip the rest of an unknown directive.
            while srp() not in _THE_END:
                srf()
        self.scan_directive_ignored_line(start_mark)
        return DirectiveToken(name, value, start_mark, end_mark)
    def scan_directive_name(self, start_mark: Any) -> Any:
        """Scan the directive name (e.g. 'YAML', 'TAG').

        See the specification for details. Raises ScannerError when the name
        is empty or not followed by a space/break.
        """
        length = 0
        srp = self.reader.peek
        ch = srp(length)
        while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' or ch in '-_:.':
            length += 1
            ch = srp(length)
        if not length:
            raise ScannerError(
                'while scanning a directive',
                start_mark,
                f'expected alphabetic or numeric character, but found {ch!r}',
                self.reader.get_mark(),
            )
        value = self.reader.prefix(length)
        self.reader.forward(length)
        ch = srp()
        if ch not in '\0 \r\n\x85\u2028\u2029':
            raise ScannerError(
                'while scanning a directive',
                start_mark,
                f'expected alphabetic or numeric character, but found {ch!r}',
                self.reader.get_mark(),
            )
        return value
    def scan_yaml_directive_value(self, start_mark: Any) -> Any:
        """Scan the '%YAML <major>.<minor>' version pair.

        See the specification for details. Stores the version on the scanner
        and on the loader's current document info; returns the tuple.
        """
        srp = self.reader.peek
        srf = self.reader.forward
        while srp() == ' ':
            srf()
        major = self.scan_yaml_directive_number(start_mark)
        if srp() != '.':
            raise ScannerError(
                'while scanning a directive',
                start_mark,
                f"expected a digit or '.', but found {srp()!r}",
                self.reader.get_mark(),
            )
        srf()
        minor = self.scan_yaml_directive_number(start_mark)
        if srp() not in '\0 \r\n\x85\u2028\u2029':
            raise ScannerError(
                'while scanning a directive',
                start_mark,
                f"expected a digit or '.', but found {srp()!r}",
                self.reader.get_mark(),
            )
        self.yaml_version = (major, minor)
        self.loader.doc_infos[-1].doc_version = Version(major, minor)
        return self.yaml_version
    def scan_yaml_directive_number(self, start_mark: Any) -> Any:
        """Scan one integer component of a %YAML version.

        See the specification for details. Raises ScannerError when the next
        character is not a digit.
        """
        srp = self.reader.peek
        srf = self.reader.forward
        ch = srp()
        if not ('0' <= ch <= '9'):
            raise ScannerError(
                'while scanning a directive',
                start_mark,
                f'expected a digit, but found {ch!r}',
                self.reader.get_mark(),
            )
        length = 0
        while '0' <= srp(length) <= '9':
            length += 1
        value = int(self.reader.prefix(length))
        srf(length)
        return value
    def scan_tag_directive_value(self, start_mark: Any) -> Any:
        """Scan a '%TAG <handle> <prefix>' pair and record it.

        See the specification for details.
        """
        srp = self.reader.peek
        srf = self.reader.forward
        while srp() == ' ':
            srf()
        handle = self.scan_tag_directive_handle(start_mark)
        while srp() == ' ':
            srf()
        prefix = self.scan_tag_directive_prefix(start_mark)
        ret_val = (handle, prefix)
        self.tag_directives.append(ret_val)
        return ret_val
  811. def scan_tag_directive_handle(self, start_mark: Any) -> Any:
  812. # See the specification for details.
  813. value = self.scan_tag_handle('directive', start_mark)
  814. ch = self.reader.peek()
  815. if ch != ' ':
  816. raise ScannerError(
  817. 'while scanning a directive',
  818. start_mark,
  819. f"expected ' ', but found {ch!r}",
  820. self.reader.get_mark(),
  821. )
  822. return value
  823. def scan_tag_directive_prefix(self, start_mark: Any) -> Any:
  824. # See the specification for details.
  825. value = self.scan_tag_uri('directive', start_mark)
  826. ch = self.reader.peek()
  827. if ch not in '\0 \r\n\x85\u2028\u2029':
  828. raise ScannerError(
  829. 'while scanning a directive',
  830. start_mark,
  831. f"expected ' ', but found {ch!r}",
  832. self.reader.get_mark(),
  833. )
  834. return value
  835. def scan_directive_ignored_line(self, start_mark: Any) -> None:
  836. # See the specification for details.
  837. srp = self.reader.peek
  838. srf = self.reader.forward
  839. while srp() == ' ':
  840. srf()
  841. if srp() == '#':
  842. while srp() not in _THE_END:
  843. srf()
  844. ch = srp()
  845. if ch not in _THE_END:
  846. raise ScannerError(
  847. 'while scanning a directive',
  848. start_mark,
  849. f'expected a comment or a line break, but found {ch!r}',
  850. self.reader.get_mark(),
  851. )
  852. self.scan_line_break()
    def scan_anchor(self, TokenClass: Any) -> Any:
        # Scan an anchor ('&name') or an alias ('*name') and wrap the name
        # in the given TokenClass (AnchorToken resp. AliasToken).
        #
        # The specification does not restrict characters for anchors and
        # aliases. This may lead to problems, for instance, the document:
        #   [ *alias, value ]
        # can be interpteted in two ways, as
        #   [ "value" ]
        # and
        #   [ *alias , "value" ]
        # Therefore we restrict aliases to numbers and ASCII letters.
        srp = self.reader.peek
        start_mark = self.reader.get_mark()
        indicator = srp()
        if indicator == '*':
            name = 'alias'
        else:
            name = 'anchor'
        self.reader.forward()
        length = 0
        ch = srp(length)
        # while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' \
        #         or ch in '-_':
        while check_anchorname_char(ch):
            length += 1
            ch = srp(length)
        if not length:
            raise ScannerError(
                f'while scanning an {name!s}',
                start_mark,
                f'expected alphabetic or numeric character, but found {ch!r}',
                self.reader.get_mark(),
            )
        value = self.reader.prefix(length)
        self.reader.forward(length)
        # ch1 = ch
        # ch = srp()  # no need to peek, ch is already set
        # assert ch1 == ch
        # the name must be terminated by whitespace or a flow indicator
        if ch not in '\0 \t\r\n\x85\u2028\u2029?:,[]{}%@`':
            raise ScannerError(
                f'while scanning an {name!s}',
                start_mark,
                f'expected alphabetic or numeric character, but found {ch!r}',
                self.reader.get_mark(),
            )
        end_mark = self.reader.get_mark()
        return TokenClass(value, start_mark, end_mark)
    def scan_tag(self) -> Any:
        # Scan a tag property and return a TagToken whose value is a
        # (handle, suffix) pair.  Handles three syntactic forms:
        # verbatim '!<uri>', handle+suffix ('!handle!suffix' / '!suffix'),
        # and the bare '!' or '!!'.
        # See the specification for details.
        srp = self.reader.peek
        start_mark = self.reader.get_mark()
        ch = srp(1)
        short_handle = '!'
        if ch == '!':
            # a second '!' means the secondary handle '!!'
            short_handle = '!!'
            self.reader.forward()
            srp = self.reader.peek
            ch = srp(1)
        if ch == '<':
            # verbatim tag: everything up to the closing '>'
            handle = None
            self.reader.forward(2)
            suffix = self.scan_tag_uri('tag', start_mark)
            if srp() != '>':
                raise ScannerError(
                    'while parsing a tag',
                    start_mark,
                    f"expected '>' but found {srp()!r}",
                    self.reader.get_mark(),
                )
            self.reader.forward()
        elif ch in _THE_END_SPACE_TAB:
            # bare '!' (or '!!'): the handle itself becomes the suffix
            handle = None
            suffix = short_handle
            self.reader.forward()
        else:
            # look ahead for a second '!' to decide between
            # '!handle!suffix' and '!suffix'
            length = 1
            use_handle = False
            while ch not in '\0 \r\n\x85\u2028\u2029':
                if ch == '!':
                    use_handle = True
                    break
                length += 1
                ch = srp(length)
            handle = short_handle
            if use_handle:
                handle = self.scan_tag_handle('tag', start_mark)
            else:
                handle = short_handle
                self.reader.forward()
            suffix = self.scan_tag_uri('tag', start_mark)
        # a tag must be followed by whitespace or end of line/stream
        ch = srp()
        if ch not in '\0 \r\n\x85\u2028\u2029':
            raise ScannerError(
                'while scanning a tag',
                start_mark,
                f"expected ' ', but found {ch!r}",
                self.reader.get_mark(),
            )
        value = (handle, suffix)
        end_mark = self.reader.get_mark()
        return TagToken(value, start_mark, end_mark)
    def scan_block_scalar(self, style: Any, rt: Optional[bool] = False) -> Any:
        # Scan a literal ('|') or folded ('>') block scalar and return a
        # ScalarToken.  With rt=True (round-trip mode) '\a' markers are
        # inserted where folds occurred so they can be re-emitted.
        # See the specification for details.
        srp = self.reader.peek
        if style == '>':
            folded = True
        else:
            folded = False
        chunks: List[Any] = []
        start_mark = self.reader.get_mark()
        # Scan the header.
        self.reader.forward()
        chomping, increment = self.scan_block_scalar_indicators(start_mark)
        # block scalar comment e.g. : |+ # comment text
        block_scalar_comment = self.scan_block_scalar_ignored_line(start_mark)
        # Determine the indentation level and go to the first non-empty line.
        min_indent = self.indent + 1
        if increment is None:
            # no increment and top level, min_indent could be 0
            if min_indent < 1 and (
                style not in '|>'
                or (self.scanner_processing_version == (1, 1))
                and getattr(
                    self.loader, 'top_level_block_style_scalar_no_indent_error_1_1', False,
                )
            ):
                min_indent = 1
            breaks, max_indent, end_mark = self.scan_block_scalar_indentation()
            indent = max(min_indent, max_indent)
        else:
            if min_indent < 1:
                min_indent = 1
            indent = min_indent + increment - 1
            breaks, end_mark = self.scan_block_scalar_breaks(indent)
        line_break = ""
        # Scan the inner part of the block scalar.
        while self.reader.column == indent and srp() != '\0':
            chunks.extend(breaks)
            leading_non_space = srp() not in ' \t'
            length = 0
            while srp(length) not in _THE_END:
                length += 1
            chunks.append(self.reader.prefix(length))
            self.reader.forward(length)
            line_break = self.scan_line_break()
            breaks, end_mark = self.scan_block_scalar_breaks(indent)
            if style in '|>' and min_indent == 0:
                # at the beginning of a line, if in block style see if
                # end of document/start_new_document
                if self.check_document_start() or self.check_document_end():
                    break
            if self.reader.column == indent and srp() != '\0':
                # Unfortunately, folding rules are ambiguous.
                #
                # This is the folding according to the specification:
                if rt and folded and line_break == '\n':
                    # round-trip marker: remember that a fold happened here
                    chunks.append('\a')
                if folded and line_break == '\n' and leading_non_space and srp() not in ' \t':
                    if not breaks:
                        chunks.append(' ')
                else:
                    chunks.append(line_break)
                # This is Clark Evans's interpretation (also in the spec
                # examples):
                #
                # if folded and line_break == '\n':
                #     if not breaks:
                #         if srp() not in ' \t':
                #             chunks.append(' ')
                #         else:
                #             chunks.append(line_break)
                # else:
                #     chunks.append(line_break)
            else:
                break
        # Process trailing line breaks. The 'chomping' setting determines
        # whether they are included in the value.
        trailing: List[Any] = []
        if chomping in [None, True]:
            chunks.append(line_break)
        if chomping is True:
            chunks.extend(breaks)
        elif chomping in [None, False]:
            trailing.extend(breaks)
        # We are done.
        token = ScalarToken("".join(chunks), False, start_mark, end_mark, style)
        if self.loader is not None:
            comment_handler = getattr(self.loader, 'comment_handling', False)
            if comment_handler is None:
                if block_scalar_comment is not None:
                    token.add_pre_comments([block_scalar_comment])
        if len(trailing) > 0:
            # Eat whitespaces and comments until we reach the next token.
            if self.loader is not None:
                comment_handler = getattr(self.loader, 'comment_handling', None)
                if comment_handler is not None:
                    line = end_mark.line - len(trailing)
                    for x in trailing:
                        assert x[-1] == '\n'
                        self.comments.add_blank_line(x, 0, line)  # type: ignore
                        line += 1
            comment = self.scan_to_next_token()
            while comment:
                trailing.append(' ' * comment[1].column + comment[0])
                comment = self.scan_to_next_token()
            if self.loader is not None:
                comment_handler = getattr(self.loader, 'comment_handling', False)
                if comment_handler is None:
                    # Keep track of the trailing whitespace and following comments
                    # as a comment token, if isn't all included in the actual value.
                    comment_end_mark = self.reader.get_mark()
                    comment = CommentToken("".join(trailing), end_mark, comment_end_mark)
                    token.add_post_comment(comment)
        return token
  1065. def scan_block_scalar_indicators(self, start_mark: Any) -> Any:
  1066. # See the specification for details.
  1067. srp = self.reader.peek
  1068. chomping = None
  1069. increment = None
  1070. ch = srp()
  1071. if ch in '+-':
  1072. if ch == '+':
  1073. chomping = True
  1074. else:
  1075. chomping = False
  1076. self.reader.forward()
  1077. ch = srp()
  1078. if ch in '0123456789':
  1079. increment = int(ch)
  1080. if increment == 0:
  1081. raise ScannerError(
  1082. 'while scanning a block scalar',
  1083. start_mark,
  1084. 'expected indentation indicator in the range 1-9, ' 'but found 0',
  1085. self.reader.get_mark(),
  1086. )
  1087. self.reader.forward()
  1088. elif ch in '0123456789':
  1089. increment = int(ch)
  1090. if increment == 0:
  1091. raise ScannerError(
  1092. 'while scanning a block scalar',
  1093. start_mark,
  1094. 'expected indentation indicator in the range 1-9, ' 'but found 0',
  1095. self.reader.get_mark(),
  1096. )
  1097. self.reader.forward()
  1098. ch = srp()
  1099. if ch in '+-':
  1100. if ch == '+':
  1101. chomping = True
  1102. else:
  1103. chomping = False
  1104. self.reader.forward()
  1105. ch = srp()
  1106. if ch not in '\0 \r\n\x85\u2028\u2029':
  1107. raise ScannerError(
  1108. 'while scanning a block scalar',
  1109. start_mark,
  1110. f'expected chomping or indentation indicators, but found {ch!r}',
  1111. self.reader.get_mark(),
  1112. )
  1113. return chomping, increment
  1114. def scan_block_scalar_ignored_line(self, start_mark: Any) -> Any:
  1115. # See the specification for details.
  1116. srp = self.reader.peek
  1117. srf = self.reader.forward
  1118. prefix = ''
  1119. comment = None
  1120. while srp() == ' ':
  1121. prefix += srp()
  1122. srf()
  1123. if srp() == '#':
  1124. comment = prefix
  1125. while srp() not in _THE_END:
  1126. comment += srp()
  1127. srf()
  1128. ch = srp()
  1129. if ch not in _THE_END:
  1130. raise ScannerError(
  1131. 'while scanning a block scalar',
  1132. start_mark,
  1133. f'expected a comment or a line break, but found {ch!r}',
  1134. self.reader.get_mark(),
  1135. )
  1136. self.scan_line_break()
  1137. return comment
    def scan_block_scalar_indentation(self) -> Any:
        # Determine the indentation of a block scalar when no explicit
        # indentation indicator was given: consume leading empty lines and
        # spaces, tracking the deepest column reached.
        # Returns (breaks, max_indent, end_mark).
        # See the specification for details.
        srp = self.reader.peek
        srf = self.reader.forward
        chunks = []
        first_indent = -1
        max_indent = 0
        end_mark = self.reader.get_mark()
        while srp() in ' \r\n\x85\u2028\u2029':
            if srp() != ' ':
                # a line break: remember the indent of the first line seen
                if first_indent < 0:
                    first_indent = self.reader.column
                chunks.append(self.scan_line_break())
                end_mark = self.reader.get_mark()
            else:
                srf()
                if self.reader.column > max_indent:
                    max_indent = self.reader.column
        # a follow-up line that is more indented than the first would make
        # the scalar's indentation ambiguous
        if first_indent > 0 and max_indent > first_indent:
            start_mark = self.reader.get_mark()
            raise ScannerError(
                'more indented follow up line than first in a block scalar', start_mark,
            )
        return chunks, max_indent, end_mark
  1162. def scan_block_scalar_breaks(self, indent: int) -> Any:
  1163. # See the specification for details.
  1164. chunks = []
  1165. srp = self.reader.peek
  1166. srf = self.reader.forward
  1167. end_mark = self.reader.get_mark()
  1168. while self.reader.column < indent and srp() == ' ':
  1169. srf()
  1170. while srp() in '\r\n\x85\u2028\u2029':
  1171. chunks.append(self.scan_line_break())
  1172. end_mark = self.reader.get_mark()
  1173. while self.reader.column < indent and srp() == ' ':
  1174. srf()
  1175. return chunks, end_mark
    def scan_flow_scalar(self, style: Any) -> Any:
        # Scan a single- or double-quoted scalar and return a ScalarToken.
        # See the specification for details.
        # Note that we loose indentation rules for quoted scalars. Quoted
        # scalars don't need to adhere indentation because " and ' clearly
        # mark the beginning and the end of them. Therefore we are less
        # restrictive then the specification requires. We only need to check
        # that document separators are not included in scalars.
        if style == '"':
            double = True
        else:
            double = False
        srp = self.reader.peek
        chunks: List[Any] = []
        start_mark = self.reader.get_mark()
        quote = srp()
        self.reader.forward()
        # alternate between non-space runs and folded whitespace until the
        # closing quote is reached
        chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
        while srp() != quote:
            chunks.extend(self.scan_flow_scalar_spaces(double, start_mark))
            chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
        self.reader.forward()
        end_mark = self.reader.get_mark()
        return ScalarToken("".join(chunks), False, start_mark, end_mark, style)
    # Map of recognized single-character escapes in double-quoted scalars
    # (the character following the backslash) to their replacement.
    ESCAPE_REPLACEMENTS = {
        '0': '\0',
        'a': '\x07',
        'b': '\x08',
        't': '\x09',
        '\t': '\x09',
        'n': '\x0A',
        'v': '\x0B',
        'f': '\x0C',
        'r': '\x0D',
        'e': '\x1B',
        ' ': '\x20',
        '"': '"',
        '/': '/',  # as per http://www.json.org/
        '\\': '\\',
        'N': '\x85',
        '_': '\xA0',
        'L': '\u2028',
        'P': '\u2029',
    }

    # Escapes that introduce a fixed number of hex digits:
    # \xXX, \uXXXX and \UXXXXXXXX.
    ESCAPE_CODES = {'x': 2, 'u': 4, 'U': 8}
    def scan_flow_scalar_non_spaces(self, double: Any, start_mark: Any) -> Any:
        # Scan the non-whitespace portion of a quoted scalar: plain runs,
        # quote escapes ('' in single-quoted, backslash escapes in
        # double-quoted) and escaped line breaks.
        # See the specification for details.
        chunks: List[Any] = []
        srp = self.reader.peek
        srf = self.reader.forward
        while True:
            length = 0
            while srp(length) not in ' \n\'"\\\0\t\r\x85\u2028\u2029':
                length += 1
            if length != 0:
                chunks.append(self.reader.prefix(length))
                srf(length)
            ch = srp()
            if not double and ch == "'" and srp(1) == "'":
                # '' inside a single-quoted scalar is an escaped quote
                chunks.append("'")
                srf(2)
            elif (double and ch == "'") or (not double and ch in '"\\'):
                # character that is literal in this quoting style
                chunks.append(ch)
                srf()
            elif double and ch == '\\':
                srf()
                ch = srp()
                if ch in self.ESCAPE_REPLACEMENTS:
                    chunks.append(self.ESCAPE_REPLACEMENTS[ch])
                    srf()
                elif ch in self.ESCAPE_CODES:
                    # \x, \u or \U followed by a fixed number of hex digits
                    length = self.ESCAPE_CODES[ch]
                    srf()
                    for k in range(length):
                        if srp(k) not in '0123456789ABCDEFabcdef':
                            raise ScannerError(
                                'while scanning a double-quoted scalar',
                                start_mark,
                                f'expected escape sequence of {length:d} '
                                f'hexdecimal numbers, but found {srp(k)!r}',
                                self.reader.get_mark(),
                            )
                    code = int(self.reader.prefix(length), 16)
                    chunks.append(chr(code))
                    srf(length)
                elif ch in '\n\r\x85\u2028\u2029':
                    # escaped line break: fold the break plus following
                    # empty lines away
                    self.scan_line_break()
                    chunks.extend(self.scan_flow_scalar_breaks(double, start_mark))
                else:
                    raise ScannerError(
                        'while scanning a double-quoted scalar',
                        start_mark,
                        f'found unknown escape character {ch!r}',
                        self.reader.get_mark(),
                    )
            else:
                return chunks
  1272. def scan_flow_scalar_spaces(self, double: Any, start_mark: Any) -> Any:
  1273. # See the specification for details.
  1274. srp = self.reader.peek
  1275. chunks = []
  1276. length = 0
  1277. while srp(length) in ' \t':
  1278. length += 1
  1279. whitespaces = self.reader.prefix(length)
  1280. self.reader.forward(length)
  1281. ch = srp()
  1282. if ch == '\0':
  1283. raise ScannerError(
  1284. 'while scanning a quoted scalar',
  1285. start_mark,
  1286. 'found unexpected end of stream',
  1287. self.reader.get_mark(),
  1288. )
  1289. elif ch in '\r\n\x85\u2028\u2029':
  1290. line_break = self.scan_line_break()
  1291. breaks = self.scan_flow_scalar_breaks(double, start_mark)
  1292. if line_break != '\n':
  1293. chunks.append(line_break)
  1294. elif not breaks:
  1295. chunks.append(' ')
  1296. chunks.extend(breaks)
  1297. else:
  1298. chunks.append(whitespaces)
  1299. return chunks
  1300. def scan_flow_scalar_breaks(self, double: Any, start_mark: Any) -> Any:
  1301. # See the specification for details.
  1302. chunks: List[Any] = []
  1303. srp = self.reader.peek
  1304. srf = self.reader.forward
  1305. while True:
  1306. # Instead of checking indentation, we check for document
  1307. # separators.
  1308. prefix = self.reader.prefix(3)
  1309. if (prefix == '---' or prefix == '...') and srp(3) in _THE_END_SPACE_TAB:
  1310. raise ScannerError(
  1311. 'while scanning a quoted scalar',
  1312. start_mark,
  1313. 'found unexpected document separator',
  1314. self.reader.get_mark(),
  1315. )
  1316. while srp() in ' \t':
  1317. srf()
  1318. if srp() in '\r\n\x85\u2028\u2029':
  1319. chunks.append(self.scan_line_break())
  1320. else:
  1321. return chunks
    def scan_plain(self) -> Any:
        # Scan a plain (unquoted) scalar and return a ScalarToken.
        # See the specification for details.
        # We add an additional restriction for the flow context:
        #   plain scalars in the flow context cannot contain ',', ': '  and '?'.
        # We also keep track of the `allow_simple_key` flag here.
        # Indentation rules are loosed for the flow context.
        srp = self.reader.peek
        srf = self.reader.forward
        chunks: List[Any] = []
        start_mark = self.reader.get_mark()
        end_mark = start_mark
        indent = self.indent + 1
        # We allow zero indentation for scalars, but then we need to check for
        # document separators at the beginning of the line.
        # if indent == 0:
        #     indent = 1
        spaces: List[Any] = []
        while True:
            length = 0
            if srp() == '#':
                break
            while True:
                ch = srp(length)
                # NOTE: deliberately disabled branch (kept for reference)
                if False and ch == ':' and srp(length + 1) == ',':
                    break
                elif ch == ':' and srp(length + 1) not in _THE_END_SPACE_TAB:
                    pass
                elif ch == '?' and self.scanner_processing_version != (1, 1):
                    pass
                elif (
                    ch in _THE_END_SPACE_TAB
                    or (
                        not self.flow_level
                        and ch == ':'
                        and srp(length + 1) in _THE_END_SPACE_TAB
                    )
                    or (self.flow_level and ch in ',:?[]{}')
                ):
                    break
                length += 1
            # It's not clear what we should do with ':' in the flow context.
            if (
                self.flow_level
                and ch == ':'
                and srp(length + 1) not in '\0 \t\r\n\x85\u2028\u2029,[]{}'
            ):
                srf(length)
                raise ScannerError(
                    'while scanning a plain scalar',
                    start_mark,
                    "found unexpected ':'",
                    self.reader.get_mark(),
                    'Please check '
                    'http://pyyaml.org/wiki/YAMLColonInFlowContext '
                    'for details.',
                )
            if length == 0:
                break
            self.allow_simple_key = False
            chunks.extend(spaces)
            chunks.append(self.reader.prefix(length))
            srf(length)
            end_mark = self.reader.get_mark()
            spaces = self.scan_plain_spaces(indent, start_mark)
            # stop at a comment, end of input, or dedent in block context
            if (
                not spaces
                or srp() == '#'
                or (not self.flow_level and self.reader.column < indent)
            ):
                break
        token = ScalarToken("".join(chunks), True, start_mark, end_mark)
        # getattr provides True so C type loader, which cannot handle comment,
        # will not make CommentToken
        if self.loader is not None:
            comment_handler = getattr(self.loader, 'comment_handling', False)
            if comment_handler is None:
                if spaces and spaces[0] == '\n':
                    # Create a comment token to preserve the trailing line breaks.
                    comment = CommentToken("".join(spaces) + '\n', start_mark, end_mark)
                    token.add_post_comment(comment)
            elif comment_handler is not False:
                line = start_mark.line + 1
                for ch in spaces:
                    if ch == '\n':
                        self.comments.add_blank_line('\n', 0, line)  # type: ignore
                        line += 1
        return token
    def scan_plain_spaces(self, indent: Any, start_mark: Any) -> Any:
        # Scan the whitespace (and possibly folded line breaks) between two
        # words of a plain scalar.  Returns the chunks to insert between
        # them, or None when a document separator terminates the scalar.
        # See the specification for details.
        # The specification is really confusing about tabs in plain scalars.
        # We just forbid them completely. Do not use tabs in YAML!
        srp = self.reader.peek
        srf = self.reader.forward
        chunks = []
        length = 0
        while srp(length) in ' ':
            length += 1
        whitespaces = self.reader.prefix(length)
        self.reader.forward(length)
        ch = srp()
        if ch in '\r\n\x85\u2028\u2029':
            line_break = self.scan_line_break()
            # a line break in block context re-enables simple keys
            self.allow_simple_key = True
            prefix = self.reader.prefix(3)
            if (prefix == '---' or prefix == '...') and srp(3) in _THE_END_SPACE_TAB:
                return
            breaks = []
            while srp() in ' \r\n\x85\u2028\u2029':
                if srp() == ' ':
                    srf()
                else:
                    breaks.append(self.scan_line_break())
                    prefix = self.reader.prefix(3)
                    if (prefix == '---' or prefix == '...') and srp(3) in _THE_END_SPACE_TAB:
                        return
            # folding: a single '\n' becomes a space, extra breaks are kept
            if line_break != '\n':
                chunks.append(line_break)
            elif not breaks:
                chunks.append(' ')
            chunks.extend(breaks)
        elif whitespaces:
            chunks.append(whitespaces)
        return chunks
  1445. def scan_tag_handle(self, name: Any, start_mark: Any) -> Any:
  1446. # See the specification for details.
  1447. # For some strange reasons, the specification does not allow '_' in
  1448. # tag handles. I have allowed it anyway.
  1449. srp = self.reader.peek
  1450. ch = srp()
  1451. if ch != '!':
  1452. raise ScannerError(
  1453. f'while scanning an {name!s}',
  1454. start_mark,
  1455. f"expected '!', but found {ch!r}",
  1456. self.reader.get_mark(),
  1457. )
  1458. length = 1
  1459. ch = srp(length)
  1460. if ch != ' ':
  1461. while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' or ch in '-_':
  1462. length += 1
  1463. ch = srp(length)
  1464. if ch != '!':
  1465. self.reader.forward(length)
  1466. raise ScannerError(
  1467. f'while scanning an {name!s}',
  1468. start_mark,
  1469. f"expected '!' but found {ch!r}",
  1470. self.reader.get_mark(),
  1471. )
  1472. length += 1
  1473. value = self.reader.prefix(length)
  1474. self.reader.forward(length)
  1475. return value
    def scan_tag_uri(self, name: Any, start_mark: Any) -> Any:
        # Scan a tag URI (the suffix of a tag, or the prefix of a %TAG
        # directive), decoding %XX escapes via scan_uri_escapes.
        # See the specification for details.
        # Note: we do not check if URI is well-formed.
        srp = self.reader.peek
        chunks = []
        length = 0
        ch = srp(length)
        while (
            '0' <= ch <= '9'
            or 'A' <= ch <= 'Z'
            or 'a' <= ch <= 'z'
            or ch in "-;/?:@&=+$,_.!~*'()[]%"
            or ((self.scanner_processing_version > (1, 1)) and ch == '#')
        ):
            if ch == '%':
                # flush the literal run scanned so far, then decode the
                # %-escape sequence(s)
                chunks.append(self.reader.prefix(length))
                self.reader.forward(length)
                length = 0
                chunks.append(self.scan_uri_escapes(name, start_mark))
            else:
                length += 1
            ch = srp(length)
        if length != 0:
            chunks.append(self.reader.prefix(length))
            self.reader.forward(length)
            length = 0
        if not chunks:
            raise ScannerError(
                f'while parsing an {name!s}',
                start_mark,
                f'expected URI, but found {ch!r}',
                self.reader.get_mark(),
            )
        return "".join(chunks)
  1510. def scan_uri_escapes(self, name: Any, start_mark: Any) -> Any:
  1511. # See the specification for details.
  1512. srp = self.reader.peek
  1513. srf = self.reader.forward
  1514. code_bytes: List[Any] = []
  1515. mark = self.reader.get_mark()
  1516. while srp() == '%':
  1517. srf()
  1518. for k in range(2):
  1519. if srp(k) not in '0123456789ABCDEFabcdef':
  1520. raise ScannerError(
  1521. f'while scanning an {name!s}',
  1522. start_mark,
  1523. f'expected URI escape sequence of 2 hexdecimal numbers, '
  1524. f'but found {srp(k)!r}',
  1525. self.reader.get_mark(),
  1526. )
  1527. code_bytes.append(int(self.reader.prefix(2), 16))
  1528. srf(2)
  1529. try:
  1530. value = bytes(code_bytes).decode('utf-8')
  1531. except UnicodeDecodeError as exc:
  1532. raise ScannerError(f'while scanning an {name!s}', start_mark, str(exc), mark)
  1533. return value
  1534. def scan_line_break(self) -> Any:
  1535. # Transforms:
  1536. # '\r\n' : '\n'
  1537. # '\r' : '\n'
  1538. # '\n' : '\n'
  1539. # '\x85' : '\n'
  1540. # '\u2028' : '\u2028'
  1541. # '\u2029 : '\u2029'
  1542. # default : ''
  1543. ch = self.reader.peek()
  1544. if ch in '\r\n\x85':
  1545. if self.reader.prefix(2) == '\r\n':
  1546. self.reader.forward(2)
  1547. else:
  1548. self.reader.forward()
  1549. return '\n'
  1550. elif ch in '\u2028\u2029':
  1551. self.reader.forward()
  1552. return ch
  1553. return ""
  1554. class RoundTripScanner(Scanner):
  1555. def check_token(self, *choices: Any) -> bool:
  1556. # Check if the next token is one of the given types.
  1557. while self.need_more_tokens():
  1558. self.fetch_more_tokens()
  1559. self._gather_comments()
  1560. if len(self.tokens) > 0:
  1561. if not choices:
  1562. return True
  1563. for choice in choices:
  1564. if isinstance(self.tokens[0], choice):
  1565. return True
  1566. return False
  1567. def peek_token(self) -> Any:
  1568. # Return the next token, but do not delete if from the queue.
  1569. while self.need_more_tokens():
  1570. self.fetch_more_tokens()
  1571. self._gather_comments()
  1572. if len(self.tokens) > 0:
  1573. return self.tokens[0]
  1574. return None
    def _gather_comments(self) -> Any:
        """combine multiple comment lines and assign to next non-comment-token

        Pops leading CommentTokens off the queue, fetching more tokens as
        needed, and attaches them as pre-comments to the first
        non-comment token.  Returns the collected comments when the queue
        runs dry before a non-comment token is found.
        """
        comments: List[Any] = []
        if not self.tokens:
            return comments
        if isinstance(self.tokens[0], CommentToken):
            comment = self.tokens.pop(0)
            self.tokens_taken += 1
            comments.append(comment)
        while self.need_more_tokens():
            self.fetch_more_tokens()
            if not self.tokens:
                return comments
            if isinstance(self.tokens[0], CommentToken):
                self.tokens_taken += 1
                comment = self.tokens.pop(0)
                # nprint('dropping2', comment)
                comments.append(comment)
        if len(comments) >= 1:
            self.tokens[0].add_pre_comments(comments)
        # pull in post comment on e.g. ':'
        if not self.done and len(self.tokens) < 2:
            self.fetch_more_tokens()
    def get_token(self) -> Any:
        # Return the next token, attaching any comment that belongs to it
        # as a post-comment first.  Returns None when the stream is done.
        while self.need_more_tokens():
            self.fetch_more_tokens()
        self._gather_comments()
        if len(self.tokens) > 0:
            # nprint('tk', self.tokens)
            # only add post comment to single line tokens:
            # scalar, value token. FlowXEndToken, otherwise
            # hidden streamtokens could get them (leave them and they will be
            # pre comments for the next map/seq
            if (
                len(self.tokens) > 1
                and isinstance(
                    self.tokens[0],
                    (ScalarToken, ValueToken, FlowSequenceEndToken, FlowMappingEndToken),
                )
                and isinstance(self.tokens[1], CommentToken)
                and self.tokens[0].end_mark.line == self.tokens[1].start_mark.line
            ):
                # comment on the same line as the token: attach it, merging
                # any directly following comment tokens into its value
                self.tokens_taken += 1
                c = self.tokens.pop(1)
                self.fetch_more_tokens()
                while len(self.tokens) > 1 and isinstance(self.tokens[1], CommentToken):
                    self.tokens_taken += 1
                    c1 = self.tokens.pop(1)
                    c.value = c.value + (' ' * c1.start_mark.column) + c1.value
                    self.fetch_more_tokens()
                self.tokens[0].add_post_comment(c)
            elif (
                len(self.tokens) > 1
                and isinstance(self.tokens[0], ScalarToken)
                and isinstance(self.tokens[1], CommentToken)
                and self.tokens[0].end_mark.line != self.tokens[1].start_mark.line
            ):
                # comment on a later line than the scalar: preserve the
                # intervening newlines and the comment's column in the value
                self.tokens_taken += 1
                c = self.tokens.pop(1)
                c.value = (
                    '\n' * (c.start_mark.line - self.tokens[0].end_mark.line)
                    + (' ' * c.start_mark.column)
                    + c.value
                )
                self.tokens[0].add_post_comment(c)
                self.fetch_more_tokens()
                while len(self.tokens) > 1 and isinstance(self.tokens[1], CommentToken):
                    self.tokens_taken += 1
                    c1 = self.tokens.pop(1)
                    c.value = c.value + (' ' * c1.start_mark.column) + c1.value
                    self.fetch_more_tokens()
            self.tokens_taken += 1
            return self.tokens.pop(0)
        return None
  1650. def fetch_comment(self, comment: Any) -> None:
  1651. value, start_mark, end_mark = comment
  1652. while value and value[-1] == ' ':
  1653. # empty line within indented key context
  1654. # no need to update end-mark, that is not used
  1655. value = value[:-1]
  1656. self.tokens.append(CommentToken(value, start_mark, end_mark))
  1657. # scanner
    def scan_to_next_token(self) -> Any:
        # We ignore spaces, line breaks and comments.
        # If we find a line break in the block context, we set the flag
        # `allow_simple_key` on.
        # The byte order mark is stripped if it's the first character in the
        # stream. We do not yet support BOM inside the stream as the
        # specification requires. Any such mark will be considered as a part
        # of the document.
        #
        # TODO: We need to make tab handling rules more sane. A good rule is
        #   Tabs cannot precede tokens
        #   BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
        #   KEY(block), VALUE(block), BLOCK-ENTRY
        # So the checking code is
        #   if <TAB>:
        #       self.allow_simple_keys = False
        # We also need to add the check for `allow_simple_keys == True` to
        # `unwind_indent` before issuing BLOCK-END.
        # Scanners for block, flow, and plain scalars need to be modified.
        #
        # Unlike the base Scanner version, this returns any comment found
        # as a (value, start_mark, end_mark) triple instead of dropping it.
        srp = self.reader.peek
        srf = self.reader.forward
        if self.reader.index == 0 and srp() == '\uFEFF':
            srf()
        found = False
        white_space = ' \t' if self.flow_level > 0 else ' '
        while not found:
            while srp() in white_space:
                srf()
            ch = srp()
            if ch == '#':
                start_mark = self.reader.get_mark()
                comment = ch
                srf()
                while ch not in _THE_END:
                    ch = srp()
                    if ch == '\0':  # don't gobble the end-of-stream character
                        # but add an explicit newline as "YAML processors should terminate
                        # the stream with an explicit line break
                        # https://yaml.org/spec/1.2/spec.html#id2780069
                        comment += '\n'
                        break
                    comment += ch
                    srf()
                # gather any blank lines following the comment
                ch = self.scan_line_break()
                while len(ch) > 0:
                    comment += ch
                    ch = self.scan_line_break()
                end_mark = self.reader.get_mark()
                if not self.flow_level:
                    self.allow_simple_key = True
                return comment, start_mark, end_mark
            if self.scan_line_break() != '':
                start_mark = self.reader.get_mark()
                if not self.flow_level:
                    self.allow_simple_key = True
                ch = srp()
                if ch == '\n':  # empty toplevel lines
                    start_mark = self.reader.get_mark()
                    comment = ""
                    while ch:
                        ch = self.scan_line_break(empty_line=True)
                        comment += ch
                    if srp() == '#':
                        # empty line followed by indented real comment
                        comment = comment.rsplit('\n', 1)[0] + '\n'
                    end_mark = self.reader.get_mark()
                    return comment, start_mark, end_mark
            else:
                found = True
        return None
  1729. def scan_line_break(self, empty_line: bool = False) -> Text:
  1730. # Transforms:
  1731. # '\r\n' : '\n'
  1732. # '\r' : '\n'
  1733. # '\n' : '\n'
  1734. # '\x85' : '\n'
  1735. # '\u2028' : '\u2028'
  1736. # '\u2029 : '\u2029'
  1737. # default : ''
  1738. ch: Text = self.reader.peek()
  1739. if ch in '\r\n\x85':
  1740. if self.reader.prefix(2) == '\r\n':
  1741. self.reader.forward(2)
  1742. else:
  1743. self.reader.forward()
  1744. return '\n'
  1745. elif ch in '\u2028\u2029':
  1746. self.reader.forward()
  1747. return ch
  1748. elif empty_line and ch in '\t ':
  1749. self.reader.forward()
  1750. return ch
  1751. return ""
    def scan_block_scalar(self, style: Any, rt: Optional[bool] = True) -> Any:
        # Thin override: delegate to the base Scanner implementation, with the
        # default of ``rt`` forced to True (presumably selecting round-trip
        # handling of the block scalar — confirm in Scanner.scan_block_scalar).
        return Scanner.scan_block_scalar(self, style, rt=rt)
  1754. def scan_uri_escapes(self, name: Any, start_mark: Any) -> Any:
  1755. """
  1756. The roundtripscanner doesn't do URI escaping
  1757. """
  1758. # See the specification for details.
  1759. srp = self.reader.peek
  1760. srf = self.reader.forward
  1761. code_bytes: List[Any] = []
  1762. chunk = ''
  1763. mark = self.reader.get_mark()
  1764. while srp() == '%':
  1765. chunk += '%'
  1766. srf()
  1767. for k in range(2):
  1768. if srp(k) not in '0123456789ABCDEFabcdef':
  1769. raise ScannerError(
  1770. f'while scanning an {name!s}',
  1771. start_mark,
  1772. f'expected URI escape sequence of 2 hexdecimal numbers, '
  1773. f'but found {srp(k)!r}',
  1774. self.reader.get_mark(),
  1775. )
  1776. code_bytes.append(int(self.reader.prefix(2), 16))
  1777. chunk += self.reader.prefix(2)
  1778. srf(2)
  1779. try:
  1780. _ = bytes(code_bytes).decode('utf-8')
  1781. except UnicodeDecodeError as exc:
  1782. raise ScannerError(f'while scanning an {name!s}', start_mark, str(exc), mark)
  1783. return chunk
# comment handling 2021; differentiation between comment kinds (see assign_eol
# below) turned out not to be needed, so both constants share the value 0
VALUECMNT = 0  # EOL comment after a value / block entry
KEYCMNT = 0  # 1  # EOL comment after a key (originally intended to be 1)
# TAGCMNT = 2
# ANCHORCMNT = 3
class CommentBase:
    """Base class for comments collected during round-trip scanning.

    Holds the comment text (without trailing newline), its source position,
    and a one-character ``used`` state: ' ' = unprocessed, '+' = used,
    '|' = assigned.  When ``_debug`` is enabled, it additionally records
    where the comment was created (``function``/``fline``) and where it was
    consumed (``ufun``/``uline``).
    """

    __slots__ = ('value', 'line', 'column', 'used', 'function', 'fline', 'ufun', 'uline')

    def __init__(self, value: Any, line: Any, column: Any) -> None:
        self.value = value
        self.line = line
        self.column = column
        self.used = ' '  # not yet processed
        if _debug != 0:
            import inspect

            # record the creating call site; [3] skips this __init__, the
            # subclass __init__ and the ScannedComments.add_* wrapper
            # — TODO confirm frame depth
            info = inspect.getframeinfo(inspect.stack()[3][0])
            self.function = info.function
            self.fline = info.lineno
            self.ufun = None
            self.uline = None

    def set_used(self, v: Any = '+') -> None:
        # mark the comment as consumed; the state character defaults to '+'
        self.used = v
        if _debug != 0:
            import inspect

            # remember the consuming call site for debug dumps
            info = inspect.getframeinfo(inspect.stack()[1][0])
            self.ufun = info.function  # type: ignore
            self.uline = info.lineno  # type: ignore

    def set_assigned(self) -> None:
        # mark the comment as assigned to a token
        self.used = '|'

    def __str__(self) -> str:
        return f'{self.value}'

    def __repr__(self) -> str:
        return f'{self.value!r}'

    def info(self) -> str:
        # debug helper: one-line summary; relies on the debug-only attributes
        # set in __init__/set_used, so only meaningful when _debug is enabled
        xv = self.value + '"'
        name = self.name  # type: ignore
        return (
            f'{name}{self.used} {self.line:2}:{self.column:<2} "{xv:40s} '
            f'{self.function}:{self.fline} {self.ufun}:{self.uline}'
        )
class EOLComment(CommentBase):
    """Comment that follows other content on the same line (end-of-line)."""

    name = 'EOLC'

    def __init__(self, value: Any, line: Any, column: Any) -> None:
        # NOTE(review): forwards unchanged to CommentBase, but it also keeps
        # the inspect.stack()[3] frame depth in CommentBase.__init__ uniform
        # across all comment classes when _debug is enabled — do not remove.
        super().__init__(value, line, column)
class FullLineComment(CommentBase):
    """Comment that occupies a whole source line on its own."""

    name = 'FULL'

    def __init__(self, value: Any, line: Any, column: Any) -> None:
        # NOTE(review): forwards unchanged to CommentBase, but it also keeps
        # the inspect.stack()[3] frame depth in CommentBase.__init__ uniform
        # across all comment classes when _debug is enabled — do not remove.
        super().__init__(value, line, column)
class BlankLineComment(CommentBase):
    """Empty (or whitespace-only) source line, tracked as a pseudo-comment."""

    name = 'BLNK'

    def __init__(self, value: Any, line: Any, column: Any) -> None:
        # NOTE(review): forwards unchanged to CommentBase, but it also keeps
        # the inspect.stack()[3] frame depth in CommentBase.__init__ uniform
        # across all comment classes when _debug is enabled — do not remove.
        super().__init__(value, line, column)
class ScannedComments:
    """Store for comments gathered while scanning, keyed by source line.

    ``comments`` maps line number -> CommentBase instance; ``unused`` is a
    FIFO of line numbers whose comments have not yet been attached to a
    token.  The ``assign_*`` methods pop from ``unused`` and attach the
    comments to tokens via ``add_comment_pre/eol/post``.
    """

    def __init__(self: Any) -> None:
        self.comments = {}  # type: ignore  # line number -> comment object
        self.unused = []  # type: ignore  # line numbers awaiting assignment

    def add_eol_comment(self, comment: Any, column: Any, line: Any) -> Any:
        """Register an end-of-line comment; return the EOLComment object."""
        # info = inspect.getframeinfo(inspect.stack()[1][0])
        # the comment either ends in exactly one newline, or contains none
        if comment.count('\n') == 1:
            assert comment[-1] == '\n'
        else:
            assert '\n' not in comment
        # stored without the trailing newline
        self.comments[line] = retval = EOLComment(comment[:-1], line, column)
        self.unused.append(line)
        return retval

    def add_blank_line(self, comment: Any, column: Any, line: Any) -> Any:
        """Register a blank line; return the BlankLineComment object."""
        # info = inspect.getframeinfo(inspect.stack()[1][0])
        assert comment.count('\n') == 1 and comment[-1] == '\n'
        assert line not in self.comments
        self.comments[line] = retval = BlankLineComment(comment[:-1], line, column)
        self.unused.append(line)
        return retval

    def add_full_line_comment(self, comment: Any, column: Any, line: Any) -> Any:
        """Register a comment occupying its own line; return the object."""
        # info = inspect.getframeinfo(inspect.stack()[1][0])
        assert comment.count('\n') == 1 and comment[-1] == '\n'
        # if comment.startswith('# C12'):
        #     raise
        # this raises in line 2127 fro 330
        self.comments[line] = retval = FullLineComment(comment[:-1], line, column)
        self.unused.append(line)
        return retval

    def __getitem__(self, idx: Any) -> Any:
        # idx is a source line number
        return self.comments[idx]

    def __str__(self) -> Any:
        # debug helper: dump all gathered comments
        return (
            'ParsedComments:\n '
            + '\n '.join((f'{lineno:2} {x.info()}' for lineno, x in self.comments.items()))
            + '\n'
        )

    def last(self) -> str:
        # debug helper: info line for the most recently added comment
        lineno, x = list(self.comments.items())[-1]
        return f'{lineno:2} {x.info()}\n'

    def any_unprocessed(self) -> bool:
        # ToDo: might want to differentiate based on lineno
        return len(self.unused) > 0
        # for lno, comment in reversed(self.comments.items()):
        #     if comment.used == ' ':
        #         return True
        # return False

    def unprocessed(self, use: Any = False) -> Any:
        # Generator over queued comments; pops them from `unused` when `use`
        # is true and marks each one used after it has been yielded.
        # NOTE(review): `inspect`/`xprintf` are only available when _debug is
        # non-zero, yet `info`/`xprintf` below run unconditionally — this
        # generator looks debug-only as written; confirm guard placement
        # before relying on it in production code paths.
        while len(self.unused) > 0:
            if _debug != 0:
                import inspect
            first = self.unused.pop(0) if use else self.unused[0]
            info = inspect.getframeinfo(inspect.stack()[1][0])
            xprintf('using', first, self.comments[first].value, info.function, info.lineno)
            yield first, self.comments[first]
            if use:
                self.comments[first].set_used()

    def assign_pre(self, token: Any) -> Any:
        """Attach all queued comments from lines before ``token``'s line as
        pre-comments of that token; return True if any were attached."""
        token_line = token.start_mark.line
        if _debug != 0:
            import inspect

            info = inspect.getframeinfo(inspect.stack()[1][0])
            xprintf('assign_pre', token_line, self.unused, info.function, info.lineno)
        gobbled = False
        while self.unused and self.unused[0] < token_line:
            gobbled = True
            first = self.unused.pop(0)
            if _debug != 0:
                xprintf('assign_pre < ', first)
            self.comments[first].set_used()
            token.add_comment_pre(first)
        return gobbled

    def assign_eol(self, tokens: Any) -> Any:
        """Attach the oldest queued EOL comment to the appropriate token.

        Looks back through ``tokens`` for the scalar the comment belongs to:
        a scalar preceded by a KEY gets the comment as KEYCMNT, one preceded
        by a VALUE/BLOCK-ENTRY gets it as VALUECMNT.  Returns without action
        when there is nothing queued or the head of the queue is not an EOL
        comment.
        """
        try:
            comment_line = self.unused[0]
        except IndexError:
            return
        if not isinstance(self.comments[comment_line], EOLComment):
            return
        # walk back over tokens starting after the comment's line (and over
        # ValueTokens) to find the token the comment should attach to
        idx = 1
        while tokens[-idx].start_mark.line > comment_line or isinstance(
            tokens[-idx], ValueToken,
        ):
            idx += 1
        if _debug != 0:
            xprintf('idx1', idx)
        # two adjacent scalars: ambiguous, leave the comment queued
        if (
            len(tokens) > idx
            and isinstance(tokens[-idx], ScalarToken)
            and isinstance(tokens[-(idx + 1)], ScalarToken)
        ):
            return
        try:
            # scalar directly after a KEY token -> key comment
            if isinstance(tokens[-idx], ScalarToken) and isinstance(
                tokens[-(idx + 1)], KeyToken,
            ):
                try:
                    eol_idx = self.unused.pop(0)
                    self.comments[eol_idx].set_used()
                    if _debug != 0:
                        xprintf('>>>>>a', idx, eol_idx, KEYCMNT)
                    tokens[-idx].add_comment_eol(eol_idx, KEYCMNT)
                except IndexError:
                    raise NotImplementedError
                return
        except IndexError:
            if _debug != 0:
                xprintf('IndexError1')
            pass
        try:
            # scalar directly after a VALUE or BLOCK-ENTRY -> value comment
            if isinstance(tokens[-idx], ScalarToken) and isinstance(
                tokens[-(idx + 1)], (ValueToken, BlockEntryToken),
            ):
                try:
                    eol_idx = self.unused.pop(0)
                    self.comments[eol_idx].set_used()
                    tokens[-idx].add_comment_eol(eol_idx, VALUECMNT)
                except IndexError:
                    raise NotImplementedError
                return
        except IndexError:
            if _debug != 0:
                xprintf('IndexError2')
            pass
        # fallthrough: no attachment rule matched — development dead end.
        # NOTE(review): xprintf is only defined when _debug is non-zero;
        # confirm this path is unreachable in production before relying on it
        for t in tokens:
            xprintf('tt-', t)
        if _debug != 0:
            xprintf('not implemented EOL', type(tokens[-idx]))
        import sys

        sys.exit(0)

    def assign_post(self, token: Any) -> Any:
        """Attach all queued comments from lines before ``token``'s line as
        post-comments of that token; return True if any were attached."""
        token_line = token.start_mark.line
        if _debug != 0:
            import inspect

            info = inspect.getframeinfo(inspect.stack()[1][0])
            xprintf('assign_post', token_line, self.unused, info.function, info.lineno)
        gobbled = False
        while self.unused and self.unused[0] < token_line:
            gobbled = True
            first = self.unused.pop(0)
            if _debug != 0:
                xprintf('assign_post < ', first)
            self.comments[first].set_used()
            token.add_comment_post(first)
        return gobbled

    def str_unprocessed(self) -> Any:
        # debug helper: dump only the comments still marked unprocessed (' ')
        return ''.join(
            (f' {ind:2} {x.info()}\n' for ind, x in self.comments.items() if x.used == ' '),
        )
  1984. class RoundTripScannerSC(Scanner): # RoundTripScanner Split Comments
    def __init__(self, *arg: Any, **kw: Any) -> None:
        """Initialize the split-comment scanner; requires a loader."""
        super().__init__(*arg, **kw)
        assert self.loader is not None
        # comments is initialised on .need_more_tokens and persists on
        # self.loader.parsed_comments
        self.comments = None
    def get_token(self) -> Any:
        """Return the next token, after attaching pending comments to it."""
        # Return the next token.
        while self.need_more_tokens():
            self.fetch_more_tokens()
        if len(self.tokens) > 0:
            # comments before a BLOCK-END belong to the construct that just
            # closed (post); otherwise they precede the upcoming token (pre)
            if isinstance(self.tokens[0], BlockEndToken):
                self.comments.assign_post(self.tokens[0])  # type: ignore
            else:
                self.comments.assign_pre(self.tokens[0])  # type: ignore
            self.tokens_taken += 1
            return self.tokens.pop(0)
  2002. def need_more_tokens(self) -> bool:
  2003. if self.comments is None:
  2004. self.loader.parsed_comments = self.comments = ScannedComments() # type: ignore
  2005. if self.done:
  2006. return False
  2007. if len(self.tokens) == 0:
  2008. return True
  2009. # The current token may be a potential simple key, so we
  2010. # need to look further.
  2011. self.stale_possible_simple_keys()
  2012. if self.next_possible_simple_key() == self.tokens_taken:
  2013. return True
  2014. if len(self.tokens) < 2:
  2015. return True
  2016. if self.tokens[0].start_mark.line == self.tokens[-1].start_mark.line:
  2017. return True
  2018. if True:
  2019. if _debug != 0:
  2020. xprintf('-x--', len(self.tokens))
  2021. for t in self.tokens:
  2022. xprintf(t)
  2023. # xprintf(self.comments.last())
  2024. xprintf(self.comments.str_unprocessed()) # type: ignore
  2025. self.comments.assign_pre(self.tokens[0]) # type: ignore
  2026. self.comments.assign_eol(self.tokens) # type: ignore
  2027. return False
    def scan_to_next_token(self) -> None:
        """Advance past whitespace/breaks; file comments into self.comments.

        Split-comment variant: instead of returning comment text to the
        caller (as RoundTripScanner.scan_to_next_token does), comments are
        stored immediately via ``self.comments.add_*_comment``.
        """
        srp = self.reader.peek
        srf = self.reader.forward
        if self.reader.index == 0 and srp() == '\uFEFF':
            # strip a byte order mark at the very start of the stream
            srf()
        start_mark = self.reader.get_mark()
        # xprintf('current_mark', start_mark.line, start_mark.column)
        found = False
        while not found:
            while srp() == ' ':
                srf()
            ch = srp()
            if ch == '#':
                comment_start_mark = self.reader.get_mark()
                comment = ch
                srf()  # skip the '#'
                while ch not in _THE_END:
                    ch = srp()
                    if ch == '\0':  # don't gobble the end-of-stream character
                        # but add an explicit newline as "YAML processors should terminate
                        # the stream with an explicit line break
                        # https://yaml.org/spec/1.2/spec.html#id2780069
                        comment += '\n'
                        break
                    comment += ch
                    srf()
                # we have a comment: column 0 means it occupies the whole
                # line, otherwise it follows content on the same line
                if start_mark.column == 0:
                    self.comments.add_full_line_comment(  # type: ignore
                        comment, comment_start_mark.column, comment_start_mark.line,
                    )
                else:
                    self.comments.add_eol_comment(  # type: ignore
                        comment, comment_start_mark.column, comment_start_mark.line,
                    )
                    comment = ""
                # gather any blank lines or full line comments following the comment as well
                self.scan_empty_or_full_line_comments()
                if not self.flow_level:
                    self.allow_simple_key = True
                return
            if bool(self.scan_line_break()):
                # start_mark = self.reader.get_mark()
                if not self.flow_level:
                    self.allow_simple_key = True
                self.scan_empty_or_full_line_comments()
                return None
                # NOTE(review): everything below up to the `else` is
                # unreachable (dead code kept from the non-SC scanner);
                # candidate for removal
                ch = srp()
                if ch == '\n':  # empty toplevel lines
                    start_mark = self.reader.get_mark()
                    comment = ""
                    while ch:
                        ch = self.scan_line_break(empty_line=True)
                        comment += ch
                    if srp() == '#':
                        # empty line followed by indented real comment
                        comment = comment.rsplit('\n', 1)[0] + '\n'
                    _ = self.reader.get_mark()  # gobble end_mark
                    return None
            else:
                found = True
        return None
    def scan_empty_or_full_line_comments(self) -> None:
        """Consume consecutive blank lines and full-line comments.

        Each completed line is stored in ``self.comments`` (as a
        BlankLineComment or FullLineComment); stops at the first line that
        contains real content, leaving the reader positioned on it.
        """
        blmark = self.reader.get_mark()
        assert blmark.column == 0
        blanks = ""
        comment = None  # None while the current line is (so far) blank
        mark = None  # start mark of the comment being collected
        ch = self.reader.peek()
        while True:
            # nprint('ch', repr(ch), self.reader.get_mark().column)
            if ch in '\r\n\x85\u2028\u2029':
                # end of line reached; CRLF is consumed as a single break
                if self.reader.prefix(2) == '\r\n':
                    self.reader.forward(2)
                else:
                    self.reader.forward()
                if comment is not None:
                    # the line just finished was a full-line comment
                    comment += '\n'
                    self.comments.add_full_line_comment(comment, mark.column, mark.line)
                    comment = None
                else:
                    # the line just finished was blank
                    blanks += '\n'
                    self.comments.add_blank_line(blanks, blmark.column, blmark.line)  # type: ignore # NOQA
                    blanks = ""
                blmark = self.reader.get_mark()
                ch = self.reader.peek()
                continue
            if comment is None:
                if ch in ' \t':
                    blanks += ch
                elif ch == '#':
                    # switch from blank-line mode to comment mode
                    mark = self.reader.get_mark()
                    comment = '#'
                else:
                    # real content: stop without consuming it
                    # xprintf('breaking on', repr(ch))
                    break
            else:
                comment += ch
            self.reader.forward()
            ch = self.reader.peek()
    def scan_block_scalar_ignored_line(self, start_mark: Any) -> Any:
        """Consume the rest of a block-scalar header line.

        Only spaces, an optional '#' comment and a line break may follow the
        header; anything else raises ScannerError.  Split-comment variant: a
        found comment is stored as an EOL comment instead of being returned.
        """
        # See the specification for details.
        srp = self.reader.peek
        srf = self.reader.forward
        prefix = ''  # collected but not used further
        comment = None
        while srp() == ' ':
            prefix += srp()
            srf()
        if srp() == '#':
            comment = ''
            mark = self.reader.get_mark()
            while srp() not in _THE_END:
                comment += srp()
                srf()
            comment += '\n'  # type: ignore
        ch = srp()
        if ch not in _THE_END:
            raise ScannerError(
                'while scanning a block scalar',
                start_mark,
                f'expected a comment or a line break, but found {ch!r}',
                self.reader.get_mark(),
            )
        if comment is not None:
            self.comments.add_eol_comment(comment, mark.column, mark.line)  # type: ignore
        self.scan_line_break()
        return None