scanner.py 87 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390
  1. from __future__ import annotations
  2. # Scanner produces tokens of the following types:
  3. # STREAM-START
  4. # STREAM-END
  5. # DIRECTIVE(name, value)
  6. # DOCUMENT-START
  7. # DOCUMENT-END
  8. # BLOCK-SEQUENCE-START
  9. # BLOCK-MAPPING-START
  10. # BLOCK-END
  11. # FLOW-SEQUENCE-START
  12. # FLOW-MAPPING-START
  13. # FLOW-SEQUENCE-END
  14. # FLOW-MAPPING-END
  15. # BLOCK-ENTRY
  16. # FLOW-ENTRY
  17. # KEY
  18. # VALUE
  19. # ALIAS(value)
  20. # ANCHOR(value)
  21. # TAG(value)
  22. # SCALAR(value, plain, style)
  23. #
  24. # RoundTripScanner
  25. # COMMENT(value)
  26. #
  27. # Read comments in the Scanner code for more details.
  28. #
  29. from ruamel.yaml.error import MarkedYAMLError
  30. import ruamel.yaml.tokens as tokens
  31. from ruamel.yaml.docinfo import Version # NOQA
  32. from ruamel.yaml.compat import check_anchorname_char, _debug, nprint, nprintf # NOQA
if False:  # MYPY
    from typing import Any, Dict, Optional, List, Union, Text, Tuple  # NOQA

# Public API of this module.
__all__ = ['Scanner', 'RoundTripScanner', 'ScannerError']

# Characters that can terminate a token: NUL plus the YAML line-break set
# (LF, CR, NEL, LINE SEPARATOR, PARAGRAPH SEPARATOR).
_THE_END = '\n\0\r\x85\u2028\u2029'
# Same set extended with space and tab.
_THE_END_SPACE_TAB = ' \n\0\t\r\x85\u2028\u2029'
# Inline whitespace only.
_SPACE_TAB = ' \t'

if _debug != 0:

    def xprintf(*args: Any, **kw: Any) -> Any:
        # Debug-only print helper; forwards to nprintf when _debug is set.
        return nprintf(*args, **kw)
class ScannerError(MarkedYAMLError):
    """Raised for problems found while tokenizing the input stream."""

    pass
class SimpleKey:
    # See below simple keys treatment.
    # Plain record describing where a potential simple key starts, both in the
    # token queue (token_number) and in the input stream (index/line/column).

    def __init__(
        self, token_number: int, required: bool, index: int, line: int, column: int, mark: Any,
    ) -> None:
        # Position in the overall token stream where the KEY token would go.
        self.token_number = token_number
        # True when a key MUST be found here (block context at the indent).
        self.required = required
        # Character offset of the key start in the stream.
        self.index = index
        self.line = line
        self.column = column
        # Mark for error reporting.
        self.mark = mark
  55. class Scanner:
  56. def __init__(self, loader: Any = None) -> None:
  57. """Initialize the scanner."""
  58. # It is assumed that Scanner and Reader will have a common descendant.
  59. # Reader do the dirty work of checking for BOM and converting the
  60. # input data to Unicode. It also adds NUL to the end.
  61. #
  62. # Reader supports the following methods
  63. # self.peek(i=0) # peek the next i-th character
  64. # self.prefix(l=1) # peek the next l characters
  65. # self.forward(l=1) # read the next l characters and move the pointer
  66. self.loader = loader
  67. if self.loader is not None and getattr(self.loader, '_scanner', None) is None:
  68. self.loader._scanner = self
  69. self.reset_scanner()
  70. self.first_time = False
    @property
    def flow_level(self) -> int:
        # Depth of flow-context nesting: number of unclosed '[' / '{'.
        # Zero means block context.
        return len(self.flow_context)
    def reset_scanner(self) -> None:
        """Reset all tokenizer state and queue the initial STREAM-START token."""
        # Had we reached the end of the stream?
        self.done = False
        # flow_context is an expanding/shrinking list consisting of '{' and '['
        # for each unclosed flow context. If empty list that means block context
        self.flow_context: List[Text] = []
        # List of processed tokens that are not yet emitted.
        self.tokens: List[Any] = []
        # Add the STREAM-START token.
        self.fetch_stream_start()
        # Number of tokens that were emitted through the `get_token` method.
        self.tokens_taken = 0
        # The current indentation level.
        self.indent = -1
        # Past indentation levels.
        self.indents: List[int] = []
        # Variables related to simple keys treatment.
        # A simple key is a key that is not denoted by the '?' indicator.
        # Example of simple keys:
        #   ---
        #   block simple key: value
        #   ? not a simple key:
        #   : { flow simple key: value }
        # We emit the KEY token before all keys, so when we find a potential
        # simple key, we try to locate the corresponding ':' indicator.
        # Simple keys should be limited to a single line and 1024 characters.
        # Can a simple key start at the current position? A simple key may
        # start:
        # - at the beginning of the line, not counting indentation spaces
        #   (in block context),
        # - after '{', '[', ',' (in the flow context),
        # - after '?', ':', '-' (in the block context).
        # In the block context, this flag also signifies if a block collection
        # may start at the current position.
        self.allow_simple_key = True
        # Keep track of possible simple keys. This is a dictionary. The key
        # is `flow_level`; there can be no more that one possible simple key
        # for each level. The value is a SimpleKey record:
        #   (token_number, required, index, line, column, mark)
        # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow),
        # '[', or '{' tokens.
        self.possible_simple_keys: Dict[Any, Any] = {}
        # Version from a %YAML directive, if one was seen.
        self.yaml_version: Any = None
        # (handle, prefix) pairs collected from %TAG directives.
        self.tag_directives: List[Tuple[Any, Any]] = []
  118. @property
  119. def reader(self) -> Any:
  120. try:
  121. return self._scanner_reader # type: ignore
  122. except AttributeError:
  123. if hasattr(self.loader, 'typ'):
  124. self._scanner_reader = self.loader.reader
  125. else:
  126. self._scanner_reader = self.loader._reader
  127. return self._scanner_reader
  128. @property
  129. def scanner_processing_version(self) -> Any: # prefix until un-composited
  130. if hasattr(self.loader, 'typ'):
  131. return self.loader.resolver.processing_version
  132. return self.loader.processing_version
  133. # Public methods.
  134. def check_token(self, *choices: Any) -> bool:
  135. # Check if the next token is one of the given types.
  136. while self.need_more_tokens():
  137. self.fetch_more_tokens()
  138. if len(self.tokens) > 0:
  139. if not choices:
  140. return True
  141. for choice in choices:
  142. if isinstance(self.tokens[0], choice):
  143. return True
  144. return False
  145. def peek_token(self) -> Any:
  146. # Return the next token, but do not delete if from the queue.
  147. while self.need_more_tokens():
  148. self.fetch_more_tokens()
  149. if len(self.tokens) > 0:
  150. return self.tokens[0]
  151. def get_token(self) -> Any:
  152. # Return the next token.
  153. while self.need_more_tokens():
  154. self.fetch_more_tokens()
  155. if len(self.tokens) > 0:
  156. self.tokens_taken += 1
  157. return self.tokens.pop(0)
  158. # Private methods.
  159. def need_more_tokens(self) -> bool:
  160. if self.done:
  161. return False
  162. if len(self.tokens) == 0:
  163. return True
  164. # The current token may be a potential simple key, so we
  165. # need to look further.
  166. self.stale_possible_simple_keys()
  167. if self.next_possible_simple_key() == self.tokens_taken:
  168. return True
  169. return False
    def fetch_comment(self, comment: Any) -> None:
        # Only the RoundTripScanner produces comments; for the base scanner
        # scan_to_next_token never returns one, so this stays abstract.
        raise NotImplementedError
    def fetch_more_tokens(self) -> Any:
        """Dispatch on the next character and append the matching token(s)."""
        # Eat whitespaces and comments until we reach the next token.
        comment = self.scan_to_next_token()
        if comment is not None:  # never happens for base scanner
            return self.fetch_comment(comment)
        # Remove obsolete possible simple keys.
        self.stale_possible_simple_keys()
        # Compare the current indentation and column. It may add some tokens
        # and decrease the current indentation level.
        self.unwind_indent(self.reader.column)
        # Peek the next character.
        ch = self.reader.peek()
        # Is it the end of stream?
        if ch == '\0':
            return self.fetch_stream_end()
        # Is it a directive?
        if ch == '%' and self.check_directive():
            return self.fetch_directive()
        # Is it the document start?
        if ch == '-' and self.check_document_start():
            return self.fetch_document_start()
        # Is it the document end?
        if ch == '.' and self.check_document_end():
            return self.fetch_document_end()
        # TODO: support for BOM within a stream.
        # if ch == '\uFEFF':
        #     return self.fetch_bom()  <-- issue BOMToken
        # Note: the order of the following checks is NOT significant.
        # Is it the flow sequence start indicator?
        if ch == '[':
            return self.fetch_flow_sequence_start()
        # Is it the flow mapping start indicator?
        if ch == '{':
            return self.fetch_flow_mapping_start()
        # Is it the flow sequence end indicator?
        if ch == ']':
            return self.fetch_flow_sequence_end()
        # Is it the flow mapping end indicator?
        if ch == '}':
            return self.fetch_flow_mapping_end()
        # Is it the flow entry indicator?
        if ch == ',':
            return self.fetch_flow_entry()
        # Is it the block entry indicator?
        if ch == '-' and self.check_block_entry():
            return self.fetch_block_entry()
        # Is it the key indicator?
        if ch == '?' and self.check_key():
            return self.fetch_key()
        # Is it the value indicator?
        if ch == ':' and self.check_value():
            return self.fetch_value()
        # Is it an alias?
        if ch == '*':
            return self.fetch_alias()
        # Is it an anchor?
        if ch == '&':
            return self.fetch_anchor()
        # Is it a tag?
        if ch == '!':
            return self.fetch_tag()
        # Is it a literal scalar?
        if ch == '|' and not self.flow_level:
            return self.fetch_literal()
        # Is it a folded scalar?
        if ch == '>' and not self.flow_level:
            return self.fetch_folded()
        # Is it a single quoted scalar?
        if ch == "'":
            return self.fetch_single()
        # Is it a double quoted scalar?
        if ch == '"':
            return self.fetch_double()
        # It must be a plain scalar then.
        if self.check_plain():
            return self.fetch_plain()
        # No? It's an error. Let's produce a nice error message.
        raise ScannerError(
            'while scanning for the next token',
            None,
            f'found character {ch!r} that cannot start any token',
            self.reader.get_mark(),
        )
  255. # Simple keys treatment.
  256. def next_possible_simple_key(self) -> Any:
  257. # Return the number of the nearest possible simple key. Actually we
  258. # don't need to loop through the whole dictionary. We may replace it
  259. # with the following code:
  260. # if not self.possible_simple_keys:
  261. # return None
  262. # return self.possible_simple_keys[
  263. # min(self.possible_simple_keys.keys())].token_number
  264. min_token_number = None
  265. for level in self.possible_simple_keys:
  266. key = self.possible_simple_keys[level]
  267. if min_token_number is None or key.token_number < min_token_number:
  268. min_token_number = key.token_number
  269. return min_token_number
    def stale_possible_simple_keys(self) -> None:
        """Remove entries that are no longer possible simple keys.

        According to the YAML specification, simple keys
        - should be limited to a single line,
        - should be no longer than 1024 characters.
        Disabling this procedure will allow simple keys of any length and
        height (may cause problems if indentation is broken though).
        """
        # Iterate over a copy of the keys since entries are deleted in place.
        for level in list(self.possible_simple_keys):
            key = self.possible_simple_keys[level]
            if key.line != self.reader.line or self.reader.index - key.index > 1024:
                if key.required:
                    # A required key that never found its ':' is a syntax error.
                    raise ScannerError(
                        'while scanning a simple key',
                        key.mark,
                        "could not find expected ':'",
                        self.reader.get_mark(),
                    )
                del self.possible_simple_keys[level]
  288. def save_possible_simple_key(self) -> None:
  289. # The next token may start a simple key. We check if it's possible
  290. # and save its position. This function is called for
  291. # ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.
  292. # Check if a simple key is required at the current position.
  293. required = not self.flow_level and self.indent == self.reader.column
  294. # The next token might be a simple key. Let's save it's number and
  295. # position.
  296. if self.allow_simple_key:
  297. self.remove_possible_simple_key()
  298. token_number = self.tokens_taken + len(self.tokens)
  299. key = SimpleKey(
  300. token_number,
  301. required,
  302. self.reader.index,
  303. self.reader.line,
  304. self.reader.column,
  305. self.reader.get_mark(),
  306. )
  307. self.possible_simple_keys[self.flow_level] = key
  308. def remove_possible_simple_key(self) -> None:
  309. # Remove the saved possible key position at the current flow level.
  310. if self.flow_level in self.possible_simple_keys:
  311. key = self.possible_simple_keys[self.flow_level]
  312. if key.required:
  313. raise ScannerError(
  314. 'while scanning a simple key',
  315. key.mark,
  316. "could not find expected ':'",
  317. self.reader.get_mark(),
  318. )
  319. del self.possible_simple_keys[self.flow_level]
  320. # Indentation functions.
  321. def unwind_indent(self, column: Any) -> None:
  322. # In flow context, tokens should respect indentation.
  323. # Actually the condition should be `self.indent >= column` according to
  324. # the spec. But this condition will prohibit intuitively correct
  325. # constructions such as
  326. # key : {
  327. # }
  328. # ####
  329. # if self.flow_level and self.indent > column:
  330. # raise ScannerError(None, None,
  331. # "invalid intendation or unclosed '[' or '{'",
  332. # self.reader.get_mark())
  333. # In the flow context, indentation is ignored. We make the scanner less
  334. # restrictive then specification requires.
  335. if bool(self.flow_level):
  336. return
  337. # In block context, we may need to issue the BLOCK-END tokens.
  338. while self.indent > column:
  339. mark = self.reader.get_mark()
  340. self.indent = self.indents.pop()
  341. self.tokens.append(tokens.BlockEndToken(mark, mark))
  342. def add_indent(self, column: int) -> bool:
  343. # Check if we need to increase indentation.
  344. if self.indent < column:
  345. self.indents.append(self.indent)
  346. self.indent = column
  347. return True
  348. return False
  349. # Fetchers.
  350. def fetch_stream_start(self) -> None:
  351. # We always add STREAM-START as the first token and STREAM-END as the
  352. # last token.
  353. # Read the token.
  354. mark = self.reader.get_mark()
  355. # Add STREAM-START.
  356. self.tokens.append(tokens.StreamStartToken(mark, mark, encoding=self.reader.encoding))
  357. def fetch_stream_end(self) -> None:
  358. # Set the current intendation to -1.
  359. self.unwind_indent(-1)
  360. # Reset simple keys.
  361. self.remove_possible_simple_key()
  362. self.allow_simple_key = False
  363. self.possible_simple_keys = {}
  364. # Read the token.
  365. mark = self.reader.get_mark()
  366. # Add STREAM-END.
  367. self.tokens.append(tokens.StreamEndToken(mark, mark))
  368. # The steam is finished.
  369. self.done = True
  370. def fetch_directive(self) -> None:
  371. # Set the current intendation to -1.
  372. self.unwind_indent(-1)
  373. # Reset simple keys.
  374. self.remove_possible_simple_key()
  375. self.allow_simple_key = False
  376. # Scan and add DIRECTIVE.
  377. self.tokens.append(self.scan_directive())
    def fetch_document_start(self) -> None:
        # '---' at column 0: emit DOCUMENT-START.
        self.fetch_document_indicator(tokens.DocumentStartToken)
    def fetch_document_end(self) -> None:
        # '...' at column 0: emit DOCUMENT-END.
        self.fetch_document_indicator(tokens.DocumentEndToken)
  382. def fetch_document_indicator(self, TokenClass: Any) -> None:
  383. # Set the current intendation to -1.
  384. self.unwind_indent(-1)
  385. # Reset simple keys. Note that there could not be a block collection
  386. # after '---'.
  387. self.remove_possible_simple_key()
  388. self.allow_simple_key = False
  389. # Add DOCUMENT-START or DOCUMENT-END.
  390. start_mark = self.reader.get_mark()
  391. self.reader.forward(3)
  392. end_mark = self.reader.get_mark()
  393. self.tokens.append(TokenClass(start_mark, end_mark))
    def fetch_flow_sequence_start(self) -> None:
        # '[': enter a flow sequence.
        self.fetch_flow_collection_start(tokens.FlowSequenceStartToken, to_push='[')
    def fetch_flow_mapping_start(self) -> None:
        # '{': enter a flow mapping.
        self.fetch_flow_collection_start(tokens.FlowMappingStartToken, to_push='{')
  398. def fetch_flow_collection_start(self, TokenClass: Any, to_push: Text) -> None:
  399. # '[' and '{' may start a simple key.
  400. self.save_possible_simple_key()
  401. # Increase the flow level.
  402. self.flow_context.append(to_push)
  403. # Simple keys are allowed after '[' and '{'.
  404. self.allow_simple_key = True
  405. # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
  406. start_mark = self.reader.get_mark()
  407. self.reader.forward()
  408. end_mark = self.reader.get_mark()
  409. self.tokens.append(TokenClass(start_mark, end_mark))
    def fetch_flow_sequence_end(self) -> None:
        # ']': leave a flow sequence.
        self.fetch_flow_collection_end(tokens.FlowSequenceEndToken)
    def fetch_flow_mapping_end(self) -> None:
        # '}': leave a flow mapping.
        self.fetch_flow_collection_end(tokens.FlowMappingEndToken)
  414. def fetch_flow_collection_end(self, TokenClass: Any) -> None:
  415. # Reset possible simple key on the current level.
  416. self.remove_possible_simple_key()
  417. # Decrease the flow level.
  418. try:
  419. popped = self.flow_context.pop() # NOQA
  420. except IndexError:
  421. # We must not be in a list or object.
  422. # Defer error handling to the parser.
  423. pass
  424. # No simple keys after ']' or '}'.
  425. self.allow_simple_key = False
  426. # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
  427. start_mark = self.reader.get_mark()
  428. self.reader.forward()
  429. end_mark = self.reader.get_mark()
  430. self.tokens.append(TokenClass(start_mark, end_mark))
  431. def fetch_flow_entry(self) -> None:
  432. # Simple keys are allowed after ','.
  433. self.allow_simple_key = True
  434. # Reset possible simple key on the current level.
  435. self.remove_possible_simple_key()
  436. # Add FLOW-ENTRY.
  437. start_mark = self.reader.get_mark()
  438. self.reader.forward()
  439. end_mark = self.reader.get_mark()
  440. self.tokens.append(tokens.FlowEntryToken(start_mark, end_mark))
    def fetch_block_entry(self) -> None:
        """Handle a '-' block-sequence entry indicator."""
        # Block context needs additional checks.
        if not self.flow_level:
            # Are we allowed to start a new entry?
            if not self.allow_simple_key:
                raise ScannerError(
                    None,
                    None,
                    'sequence entries are not allowed here',
                    self.reader.get_mark(),
                )
            # We may need to add BLOCK-SEQUENCE-START.
            if self.add_indent(self.reader.column):
                mark = self.reader.get_mark()
                self.tokens.append(tokens.BlockSequenceStartToken(mark, mark))
        # It's an error for the block entry to occur in the flow context,
        # but we let the parser detect this.
        else:
            pass
        # Simple keys are allowed after '-'.
        self.allow_simple_key = True
        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()
        # Add BLOCK-ENTRY.
        start_mark = self.reader.get_mark()
        self.reader.forward()
        end_mark = self.reader.get_mark()
        self.tokens.append(tokens.BlockEntryToken(start_mark, end_mark))
    def fetch_key(self) -> None:
        """Handle a '?' explicit-key indicator."""
        # Block context needs additional checks.
        if not self.flow_level:
            # Are we allowed to start a key (not necessarily a simple one)?
            if not self.allow_simple_key:
                raise ScannerError(
                    None, None, 'mapping keys are not allowed here', self.reader.get_mark(),
                )
            # We may need to add BLOCK-MAPPING-START.
            if self.add_indent(self.reader.column):
                mark = self.reader.get_mark()
                self.tokens.append(tokens.BlockMappingStartToken(mark, mark))
        # Simple keys are allowed after '?' in the block context.
        self.allow_simple_key = not self.flow_level
        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()
        # Add KEY.
        start_mark = self.reader.get_mark()
        self.reader.forward()
        end_mark = self.reader.get_mark()
        self.tokens.append(tokens.KeyToken(start_mark, end_mark))
    def fetch_value(self) -> None:
        """Handle a ':' value indicator, retro-inserting a KEY token if the
        preceding tokens formed a simple key."""
        # Do we determine a simple key?
        if self.flow_level in self.possible_simple_keys:
            # Add KEY at the position recorded when the key candidate started.
            key = self.possible_simple_keys[self.flow_level]
            del self.possible_simple_keys[self.flow_level]
            self.tokens.insert(
                key.token_number - self.tokens_taken, tokens.KeyToken(key.mark, key.mark),
            )
            # If this key starts a new block mapping, we need to add
            # BLOCK-MAPPING-START.
            if not self.flow_level:
                if self.add_indent(key.column):
                    self.tokens.insert(
                        key.token_number - self.tokens_taken,
                        tokens.BlockMappingStartToken(key.mark, key.mark),
                    )
            # There cannot be two simple keys one after another.
            self.allow_simple_key = False
        # It must be a part of a complex key.
        else:
            # Block context needs additional checks.
            # (Do we really need them? They will be caught by the parser
            # anyway.)
            if not self.flow_level:
                # We are allowed to start a complex value if and only if
                # we can start a simple key.
                if not self.allow_simple_key:
                    raise ScannerError(
                        None,
                        None,
                        'mapping values are not allowed here',
                        self.reader.get_mark(),
                    )
            # If this value starts a new block mapping, we need to add
            # BLOCK-MAPPING-START. It will be detected as an error later by
            # the parser.
            if not self.flow_level:
                if self.add_indent(self.reader.column):
                    mark = self.reader.get_mark()
                    self.tokens.append(tokens.BlockMappingStartToken(mark, mark))
            # Simple keys are allowed after ':' in the block context.
            self.allow_simple_key = not self.flow_level
            # Reset possible simple key on the current level.
            self.remove_possible_simple_key()
        # Add VALUE.
        start_mark = self.reader.get_mark()
        self.reader.forward()
        end_mark = self.reader.get_mark()
        self.tokens.append(tokens.ValueToken(start_mark, end_mark))
  540. def fetch_alias(self) -> None:
  541. # ALIAS could be a simple key.
  542. self.save_possible_simple_key()
  543. # No simple keys after ALIAS.
  544. self.allow_simple_key = False
  545. # Scan and add ALIAS.
  546. self.tokens.append(self.scan_anchor(tokens.AliasToken))
  547. def fetch_anchor(self) -> None:
  548. # ANCHOR could start a simple key.
  549. self.save_possible_simple_key()
  550. # No simple keys after ANCHOR.
  551. self.allow_simple_key = False
  552. # Scan and add ANCHOR.
  553. self.tokens.append(self.scan_anchor(tokens.AnchorToken))
  554. def fetch_tag(self) -> None:
  555. # TAG could start a simple key.
  556. self.save_possible_simple_key()
  557. # No simple keys after TAG.
  558. self.allow_simple_key = False
  559. # Scan and add TAG.
  560. self.tokens.append(self.scan_tag())
    def fetch_literal(self) -> None:
        # '|': literal block scalar.
        self.fetch_block_scalar(style='|')
    def fetch_folded(self) -> None:
        # '>': folded block scalar.
        self.fetch_block_scalar(style='>')
  565. def fetch_block_scalar(self, style: Any) -> None:
  566. # A simple key may follow a block scalar.
  567. self.allow_simple_key = True
  568. # Reset possible simple key on the current level.
  569. self.remove_possible_simple_key()
  570. # Scan and add SCALAR.
  571. self.tokens.append(self.scan_block_scalar(style))
    def fetch_single(self) -> None:
        # Single quoted flow scalar.
        self.fetch_flow_scalar(style="'")
    def fetch_double(self) -> None:
        # Double quoted flow scalar.
        self.fetch_flow_scalar(style='"')
  576. def fetch_flow_scalar(self, style: Any) -> None:
  577. # A flow scalar could be a simple key.
  578. self.save_possible_simple_key()
  579. # No simple keys after flow scalars.
  580. self.allow_simple_key = False
  581. # Scan and add SCALAR.
  582. self.tokens.append(self.scan_flow_scalar(style))
  583. def fetch_plain(self) -> None:
  584. # A plain scalar could be a simple key.
  585. self.save_possible_simple_key()
  586. # No simple keys after plain scalars. But note that `scan_plain` will
  587. # change this flag if the scan is finished at the beginning of the
  588. # line.
  589. self.allow_simple_key = False
  590. # Scan and add SCALAR. May change `allow_simple_key`.
  591. self.tokens.append(self.scan_plain())
  592. # Checkers.
  593. def check_directive(self) -> Any:
  594. # DIRECTIVE: ^ '%' ...
  595. # The '%' indicator is already checked.
  596. if self.reader.column == 0:
  597. return True
  598. return None
  599. def check_document_start(self) -> Any:
  600. # DOCUMENT-START: ^ '---' (' '|'\n')
  601. if self.reader.column == 0:
  602. if self.reader.prefix(3) == '---' and self.reader.peek(3) in _THE_END_SPACE_TAB:
  603. return True
  604. return None
  605. def check_document_end(self) -> Any:
  606. # DOCUMENT-END: ^ '...' (' '|'\n')
  607. if self.reader.column == 0:
  608. if self.reader.prefix(3) == '...' and self.reader.peek(3) in _THE_END_SPACE_TAB:
  609. return True
  610. return None
  611. def check_block_entry(self) -> Any:
  612. # BLOCK-ENTRY: '-' (' '|'\n')
  613. return self.reader.peek(1) in _THE_END_SPACE_TAB
  614. def check_key(self) -> Any:
  615. # KEY(flow context): '?'
  616. if bool(self.flow_level):
  617. return True
  618. # KEY(block context): '?' (' '|'\n')
  619. return self.reader.peek(1) in _THE_END_SPACE_TAB
    def check_value(self) -> Any:
        """Return True if the ':' at the current position starts a VALUE token."""
        # VALUE(flow context): ':'
        if self.scanner_processing_version == (1, 1):
            # YAML 1.1: in flow context a bare ':' always introduces a value.
            if bool(self.flow_level):
                return True
        else:
            # YAML 1.2: in flow context ':' needs a following space/end,
            # except adjacent to keys in flow mappings.
            if bool(self.flow_level):
                if self.flow_context[-1] == '[':
                    # Inside a flow sequence ':' must be followed by
                    # space/break/end to count as a value indicator.
                    if self.reader.peek(1) not in _THE_END_SPACE_TAB:
                        return False
                elif self.tokens and isinstance(self.tokens[-1], tokens.ValueToken):
                    # mapping flow context scanning a value token
                    if self.reader.peek(1) not in _THE_END_SPACE_TAB:
                        return False
                return True
        # VALUE(block context): ':' (' '|'\n')
        return self.reader.peek(1) in _THE_END_SPACE_TAB
    def check_plain(self) -> Any:
        """Return True if the current character may start a plain scalar."""
        # A plain scalar may start with any non-space character except:
        #   '-', '?', ':', ',', '[', ']', '{', '}',
        #   '#', '&', '*', '!', '|', '>', '\'', '\"',
        #   '%', '@', '`'.
        #
        # It may also start with
        #   '-', '?', ':'
        # if it is followed by a non-space character.
        #
        # Note that we limit the last rule to the block context (except the
        # '-' character) because we want the flow context to be space
        # independent.
        srp = self.reader.peek
        ch = srp()
        if self.scanner_processing_version == (1, 1):
            return ch not in '\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'"%@`' or (
                srp(1) not in _THE_END_SPACE_TAB
                and (ch == '-' or (not self.flow_level and ch in '?:'))
            )
        # YAML 1.2
        if ch not in '\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'"%@`':
            # NOTE(review): same indicator set as the 1.1 branch above;
            # upstream flagged this character class as questionable.
            return True
        ch1 = srp(1)
        if ch == '-' and ch1 not in _THE_END_SPACE_TAB:
            return True
        if ch == ':' and bool(self.flow_level) and ch1 not in _SPACE_TAB:
            return True
        return srp(1) not in _THE_END_SPACE_TAB and (
            ch == '-' or (not self.flow_level and ch in '?:')
        )
  669. # Scanners.
    def scan_to_next_token(self) -> Any:
        """Skip spaces, comments and line breaks up to the next token.

        Always returns None. Side effect: crossing a line break outside a
        flow collection re-enables `allow_simple_key`.
        """
        # We ignore spaces, line breaks and comments.
        # If we find a line break in the block context, we set the flag
        # `allow_simple_key` on.
        # The byte order mark is stripped if it's the first character in the
        # stream. We do not yet support BOM inside the stream as the
        # specification requires. Any such mark will be considered as a part
        # of the document.
        #
        # TODO: We need to make tab handling rules more sane. A good rule is
        #   Tabs cannot precede tokens
        #   BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
        #   KEY(block), VALUE(block), BLOCK-ENTRY
        # So the checking code is
        #   if <TAB>:
        #       self.allow_simple_keys = False
        # We also need to add the check for `allow_simple_keys == True` to
        # `unwind_indent` before issuing BLOCK-END.
        # Scanners for block, flow, and plain scalars need to be modified.
        srp = self.reader.peek
        srf = self.reader.forward
        if self.reader.index == 0 and srp() == '\uFEFF':
            srf()
        found = False
        _the_end = _THE_END
        # Tabs count as inter-token whitespace only inside flow collections.
        white_space = ' \t' if self.flow_level > 0 else ' '
        while not found:
            while srp() in white_space:
                srf()
            if srp() == '#':
                # Skip the comment up to (not including) the line break.
                while srp() not in _the_end:
                    srf()
            if self.scan_line_break():
                if not self.flow_level:
                    self.allow_simple_key = True
            else:
                found = True
        return None
    def scan_directive(self) -> Any:
        """Scan a '%YAML'/'%TAG'/unknown directive into a DirectiveToken."""
        # See the specification for details.
        srp = self.reader.peek
        srf = self.reader.forward
        start_mark = self.reader.get_mark()
        srf()  # skip the '%' indicator
        name = self.scan_directive_name(start_mark)
        value = None
        if name == 'YAML':
            value = self.scan_yaml_directive_value(start_mark)
            end_mark = self.reader.get_mark()
        elif name == 'TAG':
            value = self.scan_tag_directive_value(start_mark)
            end_mark = self.reader.get_mark()
        else:
            # Unknown directive: skip its arguments up to the line break.
            end_mark = self.reader.get_mark()
            while srp() not in _THE_END:
                srf()
        self.scan_directive_ignored_line(start_mark)
        return tokens.DirectiveToken(name, value, start_mark, end_mark)
    def scan_directive_name(self, start_mark: Any) -> Any:
        """Scan the directive name (alphanumerics plus '-_:.') after '%'."""
        # See the specification for details.
        length = 0
        srp = self.reader.peek
        ch = srp(length)
        while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' or ch in '-_:.':
            length += 1
            ch = srp(length)
        if not length:
            raise ScannerError(
                'while scanning a directive',
                start_mark,
                f'expected alphabetic or numeric character, but found {ch!r}',
                self.reader.get_mark(),
            )
        value = self.reader.prefix(length)
        self.reader.forward(length)
        ch = srp()
        # The name must be terminated by whitespace, a break or stream end.
        if ch not in '\0 \r\n\x85\u2028\u2029':
            raise ScannerError(
                'while scanning a directive',
                start_mark,
                f'expected alphabetic or numeric character, but found {ch!r}',
                self.reader.get_mark(),
            )
        return value
    def scan_yaml_directive_value(self, start_mark: Any) -> Any:
        """Scan 'major.minor' after %YAML; store and return the version tuple."""
        # See the specification for details.
        srp = self.reader.peek
        srf = self.reader.forward
        while srp() == ' ':
            srf()
        major = self.scan_yaml_directive_number(start_mark)
        if srp() != '.':
            raise ScannerError(
                'while scanning a directive',
                start_mark,
                f"expected a digit or '.', but found {srp()!r}",
                self.reader.get_mark(),
            )
        srf()
        minor = self.scan_yaml_directive_number(start_mark)
        # The version must be terminated by whitespace, a break or stream end.
        if srp() not in '\0 \r\n\x85\u2028\u2029':
            raise ScannerError(
                'while scanning a directive',
                start_mark,
                f"expected a digit or '.', but found {srp()!r}",
                self.reader.get_mark(),
            )
        self.yaml_version = (major, minor)
        # Also record the version on the current document's info record.
        self.loader.doc_infos[-1].doc_version = Version(major, minor)
        return self.yaml_version
  780. def scan_yaml_directive_number(self, start_mark: Any) -> Any:
  781. # See the specification for details.
  782. srp = self.reader.peek
  783. srf = self.reader.forward
  784. ch = srp()
  785. if not ('0' <= ch <= '9'):
  786. raise ScannerError(
  787. 'while scanning a directive',
  788. start_mark,
  789. f'expected a digit, but found {ch!r}',
  790. self.reader.get_mark(),
  791. )
  792. length = 0
  793. while '0' <= srp(length) <= '9':
  794. length += 1
  795. value = int(self.reader.prefix(length))
  796. srf(length)
  797. return value
  798. def scan_tag_directive_value(self, start_mark: Any) -> Any:
  799. # See the specification for details.
  800. srp = self.reader.peek
  801. srf = self.reader.forward
  802. while srp() == ' ':
  803. srf()
  804. handle = self.scan_tag_directive_handle(start_mark)
  805. while srp() == ' ':
  806. srf()
  807. prefix = self.scan_tag_directive_prefix(start_mark)
  808. ret_val = (handle, prefix)
  809. self.tag_directives.append(ret_val)
  810. return ret_val
  811. def scan_tag_directive_handle(self, start_mark: Any) -> Any:
  812. # See the specification for details.
  813. value = self.scan_tag_handle('directive', start_mark)
  814. ch = self.reader.peek()
  815. if ch != ' ':
  816. raise ScannerError(
  817. 'while scanning a directive',
  818. start_mark,
  819. f"expected ' ', but found {ch!r}",
  820. self.reader.get_mark(),
  821. )
  822. return value
  823. def scan_tag_directive_prefix(self, start_mark: Any) -> Any:
  824. # See the specification for details.
  825. value = self.scan_tag_uri('directive', start_mark)
  826. ch = self.reader.peek()
  827. if ch not in '\0 \r\n\x85\u2028\u2029':
  828. raise ScannerError(
  829. 'while scanning a directive',
  830. start_mark,
  831. f"expected ' ', but found {ch!r}",
  832. self.reader.get_mark(),
  833. )
  834. return value
  835. def scan_directive_ignored_line(self, start_mark: Any) -> None:
  836. # See the specification for details.
  837. srp = self.reader.peek
  838. srf = self.reader.forward
  839. while srp() == ' ':
  840. srf()
  841. if srp() == '#':
  842. while srp() not in _THE_END:
  843. srf()
  844. ch = srp()
  845. if ch not in _THE_END:
  846. raise ScannerError(
  847. 'while scanning a directive',
  848. start_mark,
  849. f'expected a comment or a line break, but found {ch!r}',
  850. self.reader.get_mark(),
  851. )
  852. self.scan_line_break()
    def scan_anchor(self, TokenClass: Any) -> Any:
        """Scan an anchor or alias name and return a TokenClass instance."""
        # The specification does not restrict characters for anchors and
        # aliases. This may lead to problems, for instance, the document:
        #   [ *alias, value ]
        # can be interpreted in two ways, as
        #   [ "value" ]
        # and
        #   [ *alias , "value" ]
        # Therefore we restrict aliases to numbers and ASCII letters.
        srp = self.reader.peek
        start_mark = self.reader.get_mark()
        indicator = srp()
        # '*' introduces an alias, '&' an anchor (name used in errors only).
        if indicator == '*':
            name = 'alias'
        else:
            name = 'anchor'
        self.reader.forward()
        length = 0
        ch = srp(length)
        # while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' \
        #         or ch in '-_':
        while check_anchorname_char(ch):
            length += 1
            ch = srp(length)
        if not length:
            raise ScannerError(
                f'while scanning an {name!s}',
                start_mark,
                f'expected alphabetic or numeric character, but found {ch!r}',
                self.reader.get_mark(),
            )
        value = self.reader.prefix(length)
        self.reader.forward(length)
        # ch1 = ch
        # ch = srp()  # no need to peek, ch is already set
        # assert ch1 == ch
        # The name must be terminated by whitespace or a flow indicator.
        if ch not in '\0 \t\r\n\x85\u2028\u2029?:,[]{}%@`':
            raise ScannerError(
                f'while scanning an {name!s}',
                start_mark,
                f'expected alphabetic or numeric character, but found {ch!r}',
                self.reader.get_mark(),
            )
        end_mark = self.reader.get_mark()
        return TokenClass(value, start_mark, end_mark)
    def scan_tag(self) -> Any:
        """Scan a tag: verbatim '!<uri>', handle+suffix, or a bare '!'/'!!'."""
        # See the specification for details.
        srp = self.reader.peek
        start_mark = self.reader.get_mark()
        ch = srp(1)
        short_handle = '!'
        if ch == '!':
            # Secondary '!!' handle; step past the first '!'.
            short_handle = '!!'
            self.reader.forward()
            srp = self.reader.peek
            ch = srp(1)
        if ch == '<':
            # Verbatim tag: !<...>
            handle = None
            self.reader.forward(2)
            suffix = self.scan_tag_uri('tag', start_mark)
            if srp() != '>':
                raise ScannerError(
                    'while parsing a tag',
                    start_mark,
                    f"expected '>' but found {srp()!r}",
                    self.reader.get_mark(),
                )
            self.reader.forward()
        elif ch in _THE_END_SPACE_TAB:
            # Bare handle with no suffix.
            handle = None
            suffix = short_handle
            self.reader.forward()
        else:
            # Look ahead for a second '!' to decide handle+suffix vs.
            # primary-handle + suffix.
            length = 1
            use_handle = False
            while ch not in '\0 \r\n\x85\u2028\u2029':
                if ch == '!':
                    use_handle = True
                    break
                length += 1
                ch = srp(length)
            handle = short_handle
            if use_handle:
                handle = self.scan_tag_handle('tag', start_mark)
            else:
                handle = short_handle
                self.reader.forward()
            suffix = self.scan_tag_uri('tag', start_mark)
        ch = srp()
        # A tag must be terminated by whitespace, a break or stream end.
        if ch not in '\0 \r\n\x85\u2028\u2029':
            raise ScannerError(
                'while scanning a tag',
                start_mark,
                f"expected ' ', but found {ch!r}",
                self.reader.get_mark(),
            )
        value = (handle, suffix)
        end_mark = self.reader.get_mark()
        return tokens.TagToken(value, start_mark, end_mark)
    def scan_block_scalar(self, style: Any, rt: Optional[bool] = False) -> Any:
        """Scan a literal ('|') or folded ('>') block scalar into a ScalarToken.

        rt: when True, folded newlines additionally emit a 0x07 marker
        character into the value (round-trip hint).
        """
        # See the specification for details.
        srp = self.reader.peek
        if style == '>':
            folded = True
        else:
            folded = False
        chunks: List[Any] = []
        start_mark = self.reader.get_mark()
        # Scan the header.
        self.reader.forward()
        chomping, increment = self.scan_block_scalar_indicators(start_mark)
        # block scalar comment e.g. : |+ # comment text
        block_scalar_comment = self.scan_block_scalar_ignored_line(start_mark)
        # Determine the indentation level and go to the first non-empty line.
        min_indent = self.indent + 1
        if increment is None:
            # no increment and top level, min_indent could be 0
            if min_indent < 1 and (
                style not in '|>'
                or (self.scanner_processing_version == (1, 1))
                and getattr(
                    self.loader, 'top_level_block_style_scalar_no_indent_error_1_1', False,
                )
            ):
                min_indent = 1
            breaks, max_indent, end_mark = self.scan_block_scalar_indentation()
            indent = max(min_indent, max_indent)
        else:
            if min_indent < 1:
                min_indent = 1
            indent = min_indent + increment - 1
            breaks, end_mark = self.scan_block_scalar_breaks(indent)
        line_break = ""
        # Scan the inner part of the block scalar.
        while self.reader.column == indent and srp() != '\0':
            chunks.extend(breaks)
            leading_non_space = srp() not in ' \t'
            length = 0
            while srp(length) not in _THE_END:
                length += 1
            chunks.append(self.reader.prefix(length))
            self.reader.forward(length)
            line_break = self.scan_line_break()
            breaks, end_mark = self.scan_block_scalar_breaks(indent)
            if style in '|>' and min_indent == 0:
                # at the beginning of a line, if in block style see if
                # end of document/start_new_document
                if self.check_document_start() or self.check_document_end():
                    break
            if self.reader.column == indent and srp() != '\0':
                # Unfortunately, folding rules are ambiguous.
                #
                # This is the folding according to the specification:
                if rt and folded and line_break == '\n':
                    chunks.append('\a')
                if folded and line_break == '\n' and leading_non_space and srp() not in ' \t':
                    if not breaks:
                        chunks.append(' ')
                else:
                    chunks.append(line_break)
                # This is Clark Evans's interpretation (also in the spec
                # examples):
                #
                # if folded and line_break == '\n':
                #     if not breaks:
                #         if srp() not in ' \t':
                #             chunks.append(' ')
                #         else:
                #             chunks.append(line_break)
                # else:
                #     chunks.append(line_break)
            else:
                break
        # Process trailing line breaks. The 'chomping' setting determines
        # whether they are included in the value.
        trailing: List[Any] = []
        if chomping in [None, True]:
            chunks.append(line_break)
        if chomping is True:
            chunks.extend(breaks)
        elif chomping in [None, False]:
            trailing.extend(breaks)
        # We are done.
        token = tokens.ScalarToken("".join(chunks), False, start_mark, end_mark, style)
        if self.loader is not None:
            comment_handler = getattr(self.loader, 'comment_handling', False)
            if comment_handler is None:
                # Old-style comment handling: attach the header comment.
                if block_scalar_comment is not None:
                    token.add_pre_comments([block_scalar_comment])
        if len(trailing) > 0:
            # Eat whitespaces and comments until we reach the next token.
            if self.loader is not None:
                comment_handler = getattr(self.loader, 'comment_handling', None)
                if comment_handler is not None:
                    # New-style comment handling: record chomped breaks as
                    # blank lines.
                    line = end_mark.line - len(trailing)
                    for x in trailing:
                        assert x[-1] == '\n'
                        self.comments.add_blank_line(x, 0, line)  # type: ignore
                        line += 1
            comment = self.scan_to_next_token()
            while comment:
                trailing.append(' ' * comment[1].column + comment[0])
                comment = self.scan_to_next_token()
            if self.loader is not None:
                comment_handler = getattr(self.loader, 'comment_handling', False)
                if comment_handler is None:
                    # Keep track of the trailing whitespace and following comments
                    # as a comment token, if isn't all included in the actual value.
                    comment_end_mark = self.reader.get_mark()
                    comment = tokens.CommentToken("".join(trailing),
                                                  end_mark,
                                                  comment_end_mark)
                    token.add_post_comment(comment)
        return token
    def scan_block_scalar_indicators(self, start_mark: Any) -> Any:
        """Scan the block-scalar header indicators in either order.

        Returns (chomping, increment): chomping is True for '+', False for
        '-', None if absent; increment is the explicit indent digit 1-9 or
        None if absent.
        """
        # See the specification for details.
        srp = self.reader.peek
        chomping = None
        increment = None
        ch = srp()
        if ch in '+-':
            # Chomping indicator first, optionally followed by the indent.
            if ch == '+':
                chomping = True
            else:
                chomping = False
            self.reader.forward()
            ch = srp()
            if ch in '0123456789':
                increment = int(ch)
                if increment == 0:
                    raise ScannerError(
                        'while scanning a block scalar',
                        start_mark,
                        'expected indentation indicator in the range 1-9, ' 'but found 0',
                        self.reader.get_mark(),
                    )
                self.reader.forward()
        elif ch in '0123456789':
            # Indent indicator first, optionally followed by chomping.
            increment = int(ch)
            if increment == 0:
                raise ScannerError(
                    'while scanning a block scalar',
                    start_mark,
                    'expected indentation indicator in the range 1-9, ' 'but found 0',
                    self.reader.get_mark(),
                )
            self.reader.forward()
            ch = srp()
            if ch in '+-':
                if ch == '+':
                    chomping = True
                else:
                    chomping = False
                self.reader.forward()
        ch = srp()
        # The header must end with whitespace, a break or stream end.
        if ch not in '\0 \r\n\x85\u2028\u2029':
            raise ScannerError(
                'while scanning a block scalar',
                start_mark,
                f'expected chomping or indentation indicators, but found {ch!r}',
                self.reader.get_mark(),
            )
        return chomping, increment
    def scan_block_scalar_ignored_line(self, start_mark: Any) -> Any:
        """Skip the rest of the block-scalar header line.

        Returns the trailing comment text (with its leading spaces) or
        None when no comment is present.
        """
        # See the specification for details.
        srp = self.reader.peek
        srf = self.reader.forward
        prefix = ''
        comment = None
        while srp() == ' ':
            # Accumulate the spaces so they can be prepended to the comment.
            prefix += srp()
            srf()
        if srp() == '#':
            comment = prefix
            while srp() not in _THE_END:
                comment += srp()
                srf()
        ch = srp()
        if ch not in _THE_END:
            raise ScannerError(
                'while scanning a block scalar',
                start_mark,
                f'expected a comment or a line break, but found {ch!r}',
                self.reader.get_mark(),
            )
        self.scan_line_break()
        return comment
    def scan_block_scalar_indentation(self) -> Any:
        """Determine the indentation of the first non-empty scalar line.

        Returns (breaks, max_indent, end_mark); raises when a later blank
        line is indented deeper than the first one seen.
        """
        # See the specification for details.
        srp = self.reader.peek
        srf = self.reader.forward
        chunks = []
        first_indent = -1
        max_indent = 0
        end_mark = self.reader.get_mark()
        while srp() in ' \r\n\x85\u2028\u2029':
            if srp() != ' ':
                # Record the column at which the first line break occurs.
                if first_indent < 0:
                    first_indent = self.reader.column
                chunks.append(self.scan_line_break())
                end_mark = self.reader.get_mark()
            else:
                srf()
                if self.reader.column > max_indent:
                    max_indent = self.reader.column
        if first_indent > 0 and max_indent > first_indent:
            start_mark = self.reader.get_mark()
            raise ScannerError(
                'more indented follow up line than first in a block scalar', start_mark,
            )
        return chunks, max_indent, end_mark
  1164. def scan_block_scalar_breaks(self, indent: int) -> Any:
  1165. # See the specification for details.
  1166. chunks = []
  1167. srp = self.reader.peek
  1168. srf = self.reader.forward
  1169. end_mark = self.reader.get_mark()
  1170. while self.reader.column < indent and srp() == ' ':
  1171. srf()
  1172. while srp() in '\r\n\x85\u2028\u2029':
  1173. chunks.append(self.scan_line_break())
  1174. end_mark = self.reader.get_mark()
  1175. while self.reader.column < indent and srp() == ' ':
  1176. srf()
  1177. return chunks, end_mark
  1178. def scan_flow_scalar(self, style: Any) -> Any:
  1179. # See the specification for details.
  1180. # Note that we loose indentation rules for quoted scalars. Quoted
  1181. # scalars don't need to adhere indentation because " and ' clearly
  1182. # mark the beginning and the end of them. Therefore we are less
  1183. # restrictive then the specification requires. We only need to check
  1184. # that document separators are not included in scalars.
  1185. if style == '"':
  1186. double = True
  1187. else:
  1188. double = False
  1189. srp = self.reader.peek
  1190. chunks: List[Any] = []
  1191. start_mark = self.reader.get_mark()
  1192. quote = srp()
  1193. self.reader.forward()
  1194. chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
  1195. while srp() != quote:
  1196. chunks.extend(self.scan_flow_scalar_spaces(double, start_mark))
  1197. chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
  1198. self.reader.forward()
  1199. end_mark = self.reader.get_mark()
  1200. return tokens.ScalarToken("".join(chunks), False, start_mark, end_mark, style)
    # Translation of single-character escape sequences in double-quoted
    # scalars to the characters they denote.
    ESCAPE_REPLACEMENTS = {
        '0': '\0',
        'a': '\x07',
        'b': '\x08',
        't': '\x09',
        '\t': '\x09',
        'n': '\x0A',
        'v': '\x0B',
        'f': '\x0C',
        'r': '\x0D',
        'e': '\x1B',
        ' ': '\x20',
        '"': '"',
        '/': '/',  # as per http://www.json.org/
        '\\': '\\',
        'N': '\x85',
        '_': '\xA0',
        'L': '\u2028',
        'P': '\u2029',
    }
    # Numeric escape introducers mapped to the number of hex digits that
    # must follow: \xXX, \uXXXX, \UXXXXXXXX.
    ESCAPE_CODES = {'x': 2, 'u': 4, 'U': 8}
    def scan_flow_scalar_non_spaces(self, double: Any, start_mark: Any) -> Any:
        """Scan the non-whitespace portion of a quoted scalar.

        double is True for double-quoted scalars (escape sequences are
        processed) and False for single-quoted ones ('' collapses to ').
        """
        # See the specification for details.
        chunks: List[Any] = []
        srp = self.reader.peek
        srf = self.reader.forward
        while True:
            length = 0
            while srp(length) not in ' \n\'"\\\0\t\r\x85\u2028\u2029':
                length += 1
            if length != 0:
                chunks.append(self.reader.prefix(length))
                srf(length)
            ch = srp()
            if not double and ch == "'" and srp(1) == "'":
                # '' inside a single-quoted scalar is an escaped quote.
                chunks.append("'")
                srf(2)
            elif (double and ch == "'") or (not double and ch in '"\\'):
                # A quote/backslash that is literal in this quoting style.
                chunks.append(ch)
                srf()
            elif double and ch == '\\':
                srf()
                ch = srp()
                if ch in self.ESCAPE_REPLACEMENTS:
                    chunks.append(self.ESCAPE_REPLACEMENTS[ch])
                    srf()
                elif ch in self.ESCAPE_CODES:
                    # Numeric escape: \xXX, \uXXXX or \UXXXXXXXX.
                    length = self.ESCAPE_CODES[ch]
                    srf()
                    for k in range(length):
                        if srp(k) not in '0123456789ABCDEFabcdef':
                            raise ScannerError(
                                'while scanning a double-quoted scalar',
                                start_mark,
                                f'expected escape sequence of {length:d} '
                                f'hexdecimal numbers, but found {srp(k)!r}',
                                self.reader.get_mark(),
                            )
                    code = int(self.reader.prefix(length), 16)
                    chunks.append(chr(code))
                    srf(length)
                elif ch in '\n\r\x85\u2028\u2029':
                    # Escaped line break: the break itself is folded away.
                    self.scan_line_break()
                    chunks.extend(self.scan_flow_scalar_breaks(double, start_mark))
                else:
                    raise ScannerError(
                        'while scanning a double-quoted scalar',
                        start_mark,
                        f'found unknown escape character {ch!r}',
                        self.reader.get_mark(),
                    )
            else:
                # Whitespace, a line break, the closing quote or stream end:
                # let the caller handle it.
                return chunks
  1274. def scan_flow_scalar_spaces(self, double: Any, start_mark: Any) -> Any:
  1275. # See the specification for details.
  1276. srp = self.reader.peek
  1277. chunks = []
  1278. length = 0
  1279. while srp(length) in ' \t':
  1280. length += 1
  1281. whitespaces = self.reader.prefix(length)
  1282. self.reader.forward(length)
  1283. ch = srp()
  1284. if ch == '\0':
  1285. raise ScannerError(
  1286. 'while scanning a quoted scalar',
  1287. start_mark,
  1288. 'found unexpected end of stream',
  1289. self.reader.get_mark(),
  1290. )
  1291. elif ch in '\r\n\x85\u2028\u2029':
  1292. line_break = self.scan_line_break()
  1293. breaks = self.scan_flow_scalar_breaks(double, start_mark)
  1294. if line_break != '\n':
  1295. chunks.append(line_break)
  1296. elif not breaks:
  1297. chunks.append(' ')
  1298. chunks.extend(breaks)
  1299. else:
  1300. chunks.append(whitespaces)
  1301. return chunks
  1302. def scan_flow_scalar_breaks(self, double: Any, start_mark: Any) -> Any:
  1303. # See the specification for details.
  1304. chunks: List[Any] = []
  1305. srp = self.reader.peek
  1306. srf = self.reader.forward
  1307. while True:
  1308. # Instead of checking indentation, we check for document
  1309. # separators.
  1310. prefix = self.reader.prefix(3)
  1311. if (prefix == '---' or prefix == '...') and srp(3) in _THE_END_SPACE_TAB:
  1312. raise ScannerError(
  1313. 'while scanning a quoted scalar',
  1314. start_mark,
  1315. 'found unexpected document separator',
  1316. self.reader.get_mark(),
  1317. )
  1318. while srp() in ' \t':
  1319. srf()
  1320. if srp() in '\r\n\x85\u2028\u2029':
  1321. chunks.append(self.scan_line_break())
  1322. else:
  1323. return chunks
    def scan_plain(self) -> Any:
        """Scan a plain (unquoted) scalar and return a ScalarToken.

        In the flow context plain scalars may not contain ',', ': ' and
        '?'. Also maintains the `allow_simple_key` flag; indentation rules
        are loosened for the flow context.
        """
        # See the specification for details.
        srp = self.reader.peek
        srf = self.reader.forward
        chunks: List[Any] = []
        start_mark = self.reader.get_mark()
        end_mark = start_mark
        indent = self.indent + 1
        # We allow zero indentation for scalars, but then we need to check for
        # document separators at the beginning of the line.
        # if indent == 0:
        #     indent = 1
        spaces: List[Any] = []
        while True:
            length = 0
            if srp() == '#':
                # A comment terminates the scalar.
                break
            while True:
                ch = srp(length)
                if False and ch == ':' and srp(length + 1) == ',':
                    # Never taken (guarded by False); kept from upstream.
                    break
                elif ch == ':' and srp(length + 1) not in _THE_END_SPACE_TAB:
                    pass
                elif ch == '?' and self.scanner_processing_version != (1, 1):
                    pass
                elif (
                    ch in _THE_END_SPACE_TAB
                    or (
                        not self.flow_level
                        and ch == ':'
                        and srp(length + 1) in _THE_END_SPACE_TAB
                    )
                    or (self.flow_level and ch in ',:?[]{}')
                ):
                    break
                length += 1
            # It's not clear what we should do with ':' in the flow context.
            if (
                self.flow_level
                and ch == ':'
                and srp(length + 1) not in '\0 \t\r\n\x85\u2028\u2029,[]{}'
            ):
                srf(length)
                raise ScannerError(
                    'while scanning a plain scalar',
                    start_mark,
                    "found unexpected ':'",
                    self.reader.get_mark(),
                    'Please check '
                    'http://pyyaml.org/wiki/YAMLColonInFlowContext '
                    'for details.',
                )
            if length == 0:
                break
            self.allow_simple_key = False
            chunks.extend(spaces)
            chunks.append(self.reader.prefix(length))
            srf(length)
            end_mark = self.reader.get_mark()
            spaces = self.scan_plain_spaces(indent, start_mark)
            if (
                not spaces
                or srp() == '#'
                or (not self.flow_level and self.reader.column < indent)
            ):
                break
        token = tokens.ScalarToken("".join(chunks), True, start_mark, end_mark)
        # getattr provides True so C type loader, which cannot handle comment,
        # will not make CommentToken
        if self.loader is not None:
            comment_handler = getattr(self.loader, 'comment_handling', False)
            if comment_handler is None:
                if spaces and spaces[0] == '\n':
                    # Create a comment token to preserve the trailing line breaks.
                    comment = tokens.CommentToken("".join(spaces) + '\n', start_mark, end_mark)
                    token.add_post_comment(comment)
            elif comment_handler is not False:
                # New-style comment handling: record each trailing break as
                # a blank line.
                line = start_mark.line + 1
                for ch in spaces:
                    if ch == '\n':
                        self.comments.add_blank_line('\n', 0, line)  # type: ignore
                        line += 1
        return token
    def scan_plain_spaces(self, indent: Any, start_mark: Any) -> Any:
        """Scan spaces and line folding after a plain-scalar chunk.

        Returns the folded whitespace chunks, or None (implicitly) when a
        document separator ('---'/'...') is encountered, which ends the
        scalar.
        """
        # See the specification for details.
        # The specification is really confusing about tabs in plain scalars.
        # We just forbid them completely. Do not use tabs in YAML!
        srp = self.reader.peek
        srf = self.reader.forward
        chunks = []
        length = 0
        while srp(length) in ' ':
            length += 1
        whitespaces = self.reader.prefix(length)
        self.reader.forward(length)
        ch = srp()
        if ch in '\r\n\x85\u2028\u2029':
            line_break = self.scan_line_break()
            # A line break here allows a simple key to follow.
            self.allow_simple_key = True
            prefix = self.reader.prefix(3)
            if (prefix == '---' or prefix == '...') and srp(3) in _THE_END_SPACE_TAB:
                return
            breaks = []
            while srp() in ' \r\n\x85\u2028\u2029':
                if srp() == ' ':
                    srf()
                else:
                    breaks.append(self.scan_line_break())
                    prefix = self.reader.prefix(3)
                    if (prefix == '---' or prefix == '...') and srp(3) in _THE_END_SPACE_TAB:
                        return
            # Fold: a lone '\n' becomes a space; other breaks are kept.
            if line_break != '\n':
                chunks.append(line_break)
            elif not breaks:
                chunks.append(' ')
            chunks.extend(breaks)
        elif whitespaces:
            chunks.append(whitespaces)
        return chunks
    def scan_tag_handle(self, name: Any, start_mark: Any) -> Any:
        """Scan a tag handle: '!' [word] '!', or a bare '!'.

        name ('directive' or 'tag') is used in error messages only.
        """
        # See the specification for details.
        # For some strange reasons, the specification does not allow '_' in
        # tag handles. I have allowed it anyway.
        srp = self.reader.peek
        ch = srp()
        if ch != '!':
            raise ScannerError(
                f'while scanning an {name!s}',
                start_mark,
                f"expected '!', but found {ch!r}",
                self.reader.get_mark(),
            )
        length = 1
        ch = srp(length)
        if ch != ' ':
            # A named handle: consume the word and require a closing '!'.
            while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' or ch in '-_':
                length += 1
                ch = srp(length)
            if ch != '!':
                self.reader.forward(length)
                raise ScannerError(
                    f'while scanning an {name!s}',
                    start_mark,
                    f"expected '!' but found {ch!r}",
                    self.reader.get_mark(),
                )
            length += 1
        value = self.reader.prefix(length)
        self.reader.forward(length)
        return value
  1478. def scan_tag_uri(self, name: Any, start_mark: Any) -> Any:
  1479. # See the specification for details.
  1480. # Note: we do not check if URI is well-formed.
  1481. srp = self.reader.peek
  1482. chunks = []
  1483. length = 0
  1484. ch = srp(length)
  1485. while (
  1486. '0' <= ch <= '9'
  1487. or 'A' <= ch <= 'Z'
  1488. or 'a' <= ch <= 'z'
  1489. or ch in "-;/?:@&=+$,_.!~*'()[]%"
  1490. or ((self.scanner_processing_version > (1, 1)) and ch == '#')
  1491. ):
  1492. if ch == '%':
  1493. chunks.append(self.reader.prefix(length))
  1494. self.reader.forward(length)
  1495. length = 0
  1496. chunks.append(self.scan_uri_escapes(name, start_mark))
  1497. else:
  1498. length += 1
  1499. ch = srp(length)
  1500. if length != 0:
  1501. chunks.append(self.reader.prefix(length))
  1502. self.reader.forward(length)
  1503. length = 0
  1504. if not chunks:
  1505. raise ScannerError(
  1506. f'while parsing an {name!s}',
  1507. start_mark,
  1508. f'expected URI, but found {ch!r}',
  1509. self.reader.get_mark(),
  1510. )
  1511. return "".join(chunks)
  1512. def scan_uri_escapes(self, name: Any, start_mark: Any) -> Any:
  1513. # See the specification for details.
  1514. srp = self.reader.peek
  1515. srf = self.reader.forward
  1516. code_bytes: List[Any] = []
  1517. mark = self.reader.get_mark()
  1518. while srp() == '%':
  1519. srf()
  1520. for k in range(2):
  1521. if srp(k) not in '0123456789ABCDEFabcdef':
  1522. raise ScannerError(
  1523. f'while scanning an {name!s}',
  1524. start_mark,
  1525. f'expected URI escape sequence of 2 hexdecimal numbers, '
  1526. f'but found {srp(k)!r}',
  1527. self.reader.get_mark(),
  1528. )
  1529. code_bytes.append(int(self.reader.prefix(2), 16))
  1530. srf(2)
  1531. try:
  1532. value = bytes(code_bytes).decode('utf-8')
  1533. except UnicodeDecodeError as exc:
  1534. raise ScannerError(f'while scanning an {name!s}', start_mark, str(exc), mark)
  1535. return value
  1536. def scan_line_break(self) -> Any:
  1537. # Transforms:
  1538. # '\r\n' : '\n'
  1539. # '\r' : '\n'
  1540. # '\n' : '\n'
  1541. # '\x85' : '\n'
  1542. # '\u2028' : '\u2028'
  1543. # '\u2029 : '\u2029'
  1544. # default : ''
  1545. ch = self.reader.peek()
  1546. if ch in '\r\n\x85':
  1547. if self.reader.prefix(2) == '\r\n':
  1548. self.reader.forward(2)
  1549. else:
  1550. self.reader.forward()
  1551. return '\n'
  1552. elif ch in '\u2028\u2029':
  1553. self.reader.forward()
  1554. return ch
  1555. return ""
class RoundTripScanner(Scanner):
    """Scanner that preserves comments across a load/dump round trip.

    Comments are scanned into CommentToken instances and attached to the
    surrounding tokens as pre- or post-comments instead of being discarded.
    """

    def check_token(self, *choices: Any) -> bool:
        # Check if the next token is one of the given types.
        while self.need_more_tokens():
            self.fetch_more_tokens()
        self._gather_comments()
        if len(self.tokens) > 0:
            if not choices:
                return True
            for choice in choices:
                if isinstance(self.tokens[0], choice):
                    return True
        return False

    def peek_token(self) -> Any:
        # Return the next token, but do not delete if from the queue.
        while self.need_more_tokens():
            self.fetch_more_tokens()
        self._gather_comments()
        if len(self.tokens) > 0:
            return self.tokens[0]
        return None

    def _gather_comments(self) -> Any:
        """combine multiple comment lines and assign to next non-comment-token"""
        comments: List[Any] = []
        if not self.tokens:
            return comments
        # consume a leading CommentToken, if any
        if isinstance(self.tokens[0], tokens.CommentToken):
            comment = self.tokens.pop(0)
            self.tokens_taken += 1
            comments.append(comment)
        # keep fetching; every CommentToken that surfaces at the head of the
        # queue is popped and collected
        while self.need_more_tokens():
            self.fetch_more_tokens()
            if not self.tokens:
                return comments
            if isinstance(self.tokens[0], tokens.CommentToken):
                self.tokens_taken += 1
                comment = self.tokens.pop(0)
                # nprint('dropping2', comment)
                comments.append(comment)
        if len(comments) >= 1:
            # attach everything gathered to the first non-comment token
            self.tokens[0].add_pre_comments(comments)
        # pull in post comment on e.g. ':'
        if not self.done and len(self.tokens) < 2:
            self.fetch_more_tokens()

    def get_token(self) -> Any:
        # Return the next token.
        while self.need_more_tokens():
            self.fetch_more_tokens()
        self._gather_comments()
        if len(self.tokens) > 0:
            # nprint('tk', self.tokens)
            # only add post comment to single line tokens:
            # scalar, value token. FlowXEndToken, otherwise
            # hidden streamtokens could get them (leave them and they will be
            # pre comments for the next map/seq
            if (
                len(self.tokens) > 1
                and isinstance(
                    self.tokens[0],
                    (tokens.ScalarToken, tokens.ValueToken, tokens.FlowSequenceEndToken,
                     tokens.FlowMappingEndToken),
                )
                and isinstance(self.tokens[1], tokens.CommentToken)
                and self.tokens[0].end_mark.line == self.tokens[1].start_mark.line
            ):
                # comment on the same line as the token -> post comment;
                # directly following comment-only lines are merged into it
                self.tokens_taken += 1
                c = self.tokens.pop(1)
                self.fetch_more_tokens()
                while len(self.tokens) > 1 and isinstance(self.tokens[1], tokens.CommentToken):
                    self.tokens_taken += 1
                    c1 = self.tokens.pop(1)
                    c.value = c.value + (' ' * c1.start_mark.column) + c1.value
                    self.fetch_more_tokens()
                self.tokens[0].add_post_comment(c)
            elif (
                len(self.tokens) > 1
                and isinstance(self.tokens[0], tokens.ScalarToken)
                and isinstance(self.tokens[1], tokens.CommentToken)
                and self.tokens[0].end_mark.line != self.tokens[1].start_mark.line
            ):
                # comment on a later line than the scalar: re-insert the
                # intervening newlines and indentation into the comment value
                self.tokens_taken += 1
                c = self.tokens.pop(1)
                c.value = (
                    '\n' * (c.start_mark.line - self.tokens[0].end_mark.line)
                    + (' ' * c.start_mark.column)
                    + c.value
                )
                self.tokens[0].add_post_comment(c)
                self.fetch_more_tokens()
                while len(self.tokens) > 1 and isinstance(self.tokens[1], tokens.CommentToken):
                    self.tokens_taken += 1
                    c1 = self.tokens.pop(1)
                    c.value = c.value + (' ' * c1.start_mark.column) + c1.value
                    self.fetch_more_tokens()
            self.tokens_taken += 1
            return self.tokens.pop(0)
        return None

    def fetch_comment(self, comment: Any) -> None:
        # comment is the (value, start_mark, end_mark) triple produced by
        # scan_to_next_token()
        value, start_mark, end_mark = comment
        while value and value[-1] == ' ':
            # empty line within indented key context
            # no need to update end-mark, that is not used
            value = value[:-1]
        self.tokens.append(tokens.CommentToken(value, start_mark, end_mark))

    # scanner

    def scan_to_next_token(self) -> Any:
        # We ignore spaces, line breaks and comments.
        # If we find a line break in the block context, we set the flag
        # `allow_simple_key` on.
        # The byte order mark is stripped if it's the first character in the
        # stream. We do not yet support BOM inside the stream as the
        # specification requires. Any such mark will be considered as a part
        # of the document.
        #
        # TODO: We need to make tab handling rules more sane. A good rule is
        # Tabs cannot precede tokens
        # BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
        # KEY(block), VALUE(block), BLOCK-ENTRY
        # So the checking code is
        # if <TAB>:
        # self.allow_simple_keys = False
        # We also need to add the check for `allow_simple_keys == True` to
        # `unwind_indent` before issuing BLOCK-END.
        # Scanners for block, flow, and plain scalars need to be modified.
        srp = self.reader.peek
        srf = self.reader.forward
        if self.reader.index == 0 and srp() == '\uFEFF':
            srf()
        found = False
        # tabs count as skippable whitespace only in flow context
        white_space = ' \t' if self.flow_level > 0 else ' '
        while not found:
            while srp() in white_space:
                srf()
            ch = srp()
            if ch == '#':
                # collect the comment text up to end of line / end of stream
                start_mark = self.reader.get_mark()
                comment = ch
                srf()
                while ch not in _THE_END:
                    ch = srp()
                    if ch == '\0':  # don't gobble the end-of-stream character
                        # but add an explicit newline as "YAML processors should terminate
                        # the stream with an explicit line break
                        # https://yaml.org/spec/1.2/spec.html#id2780069
                        comment += '\n'
                        break
                    comment += ch
                    srf()
                # gather any blank lines following the comment
                ch = self.scan_line_break()
                while len(ch) > 0:
                    comment += ch
                    ch = self.scan_line_break()
                end_mark = self.reader.get_mark()
                if not self.flow_level:
                    self.allow_simple_key = True
                return comment, start_mark, end_mark
            if self.scan_line_break() != '':
                start_mark = self.reader.get_mark()
                if not self.flow_level:
                    self.allow_simple_key = True
                ch = srp()
                if ch == '\n':  # empty toplevel lines
                    start_mark = self.reader.get_mark()
                    comment = ""
                    while ch:
                        ch = self.scan_line_break(empty_line=True)
                        comment += ch
                    if srp() == '#':
                        # empty line followed by indented real comment
                        comment = comment.rsplit('\n', 1)[0] + '\n'
                    end_mark = self.reader.get_mark()
                    return comment, start_mark, end_mark
            else:
                found = True
        return None

    def scan_line_break(self, empty_line: bool = False) -> Text:
        # Transforms:
        # '\r\n' : '\n'
        # '\r' : '\n'
        # '\n' : '\n'
        # '\x85' : '\n'
        # '\u2028' : '\u2028'
        # '\u2029 : '\u2029'
        # default : ''
        # With empty_line=True a single space or tab is also consumed and
        # returned (used when gathering empty top-level lines).
        ch: Text = self.reader.peek()
        if ch in '\r\n\x85':
            if self.reader.prefix(2) == '\r\n':
                self.reader.forward(2)
            else:
                self.reader.forward()
            return '\n'
        elif ch in '\u2028\u2029':
            self.reader.forward()
            return ch
        elif empty_line and ch in '\t ':
            self.reader.forward()
            return ch
        return ""

    def scan_block_scalar(self, style: Any, rt: Optional[bool] = True) -> Any:
        # delegate to the base implementation, defaulting to round-trip mode
        return Scanner.scan_block_scalar(self, style, rt=rt)

    def scan_uri_escapes(self, name: Any, start_mark: Any) -> Any:
        """
        The roundtripscanner doesn't do URI escaping
        """
        # See the specification for details.
        # Validates the %XX sequences (and that they decode as UTF-8) but
        # returns the raw '%XX...' text unchanged.
        srp = self.reader.peek
        srf = self.reader.forward
        code_bytes: List[Any] = []
        chunk = ''
        mark = self.reader.get_mark()
        while srp() == '%':
            chunk += '%'
            srf()
            for k in range(2):
                if srp(k) not in '0123456789ABCDEFabcdef':
                    raise ScannerError(
                        f'while scanning an {name!s}',
                        start_mark,
                        f'expected URI escape sequence of 2 hexdecimal numbers, '
                        f'but found {srp(k)!r}',
                        self.reader.get_mark(),
                    )
            code_bytes.append(int(self.reader.prefix(2), 16))
            chunk += self.reader.prefix(2)
            srf(2)
        try:
            # decode only to validate; the decoded value is discarded
            _ = bytes(code_bytes).decode('utf-8')
        except UnicodeDecodeError as exc:
            raise ScannerError(f'while scanning an {name!s}', start_mark, str(exc), mark)
        return chunk
# commenthandling 2021, differentiation not needed
# Comment-placement categories passed to Token.add_comment_eol(); key and
# value comments are currently treated the same (both 0).  Tag and anchor
# categories are reserved but unused.
VALUECMNT = 0
KEYCMNT = 0  # 1
# TAGCMNT = 2
# ANCHORCMNT = 3
  1792. class CommentBase:
  1793. __slots__ = ('value', 'line', 'column', 'used', 'function', 'fline', 'ufun', 'uline')
  1794. def __init__(self, value: Any, line: Any, column: Any) -> None:
  1795. self.value = value
  1796. self.line = line
  1797. self.column = column
  1798. self.used = ' '
  1799. if _debug != 0:
  1800. import inspect
  1801. info = inspect.getframeinfo(inspect.stack()[3][0])
  1802. self.function = info.function
  1803. self.fline = info.lineno
  1804. self.ufun = None
  1805. self.uline = None
  1806. def set_used(self, v: Any = '+') -> None:
  1807. self.used = v
  1808. if _debug != 0:
  1809. import inspect
  1810. info = inspect.getframeinfo(inspect.stack()[1][0])
  1811. self.ufun = info.function # type: ignore
  1812. self.uline = info.lineno # type: ignore
  1813. def set_assigned(self) -> None:
  1814. self.used = '|'
  1815. def __str__(self) -> str:
  1816. return f'{self.value}'
  1817. def __repr__(self) -> str:
  1818. return f'{self.value!r}'
  1819. def info(self) -> str:
  1820. xv = self.value + '"'
  1821. name = self.name # type: ignore
  1822. return (
  1823. f'{name}{self.used} {self.line:2}:{self.column:<2} "{xv:40s} '
  1824. f'{self.function}:{self.fline} {self.ufun}:{self.uline}'
  1825. )
  1826. class EOLComment(CommentBase):
  1827. name = 'EOLC'
  1828. def __init__(self, value: Any, line: Any, column: Any) -> None:
  1829. super().__init__(value, line, column)
  1830. class FullLineComment(CommentBase):
  1831. name = 'FULL'
  1832. def __init__(self, value: Any, line: Any, column: Any) -> None:
  1833. super().__init__(value, line, column)
  1834. class BlankLineComment(CommentBase):
  1835. name = 'BLNK'
  1836. def __init__(self, value: Any, line: Any, column: Any) -> None:
  1837. super().__init__(value, line, column)
  1838. class ScannedComments:
  1839. def __init__(self: Any) -> None:
  1840. self.comments = {} # type: ignore
  1841. self.unused = [] # type: ignore
  1842. def add_eol_comment(self, comment: Any, column: Any, line: Any) -> Any:
  1843. # info = inspect.getframeinfo(inspect.stack()[1][0])
  1844. if comment.count('\n') == 1:
  1845. assert comment[-1] == '\n'
  1846. else:
  1847. assert '\n' not in comment
  1848. self.comments[line] = retval = EOLComment(comment[:-1], line, column)
  1849. self.unused.append(line)
  1850. return retval
  1851. def add_blank_line(self, comment: Any, column: Any, line: Any) -> Any:
  1852. # info = inspect.getframeinfo(inspect.stack()[1][0])
  1853. assert comment.count('\n') == 1 and comment[-1] == '\n'
  1854. assert line not in self.comments
  1855. self.comments[line] = retval = BlankLineComment(comment[:-1], line, column)
  1856. self.unused.append(line)
  1857. return retval
  1858. def add_full_line_comment(self, comment: Any, column: Any, line: Any) -> Any:
  1859. # info = inspect.getframeinfo(inspect.stack()[1][0])
  1860. assert comment.count('\n') == 1 and comment[-1] == '\n'
  1861. # if comment.startswith('# C12'):
  1862. # raise
  1863. # this raises in line 2127 fro 330
  1864. self.comments[line] = retval = FullLineComment(comment[:-1], line, column)
  1865. self.unused.append(line)
  1866. return retval
  1867. def __getitem__(self, idx: Any) -> Any:
  1868. return self.comments[idx]
  1869. def __str__(self) -> Any:
  1870. return (
  1871. 'ParsedComments:\n '
  1872. + '\n '.join((f'{lineno:2} {x.info()}' for lineno, x in self.comments.items()))
  1873. + '\n'
  1874. )
  1875. def last(self) -> str:
  1876. lineno, x = list(self.comments.items())[-1]
  1877. return f'{lineno:2} {x.info()}\n'
  1878. def any_unprocessed(self) -> bool:
  1879. # ToDo: might want to differentiate based on lineno
  1880. return len(self.unused) > 0
  1881. # for lno, comment in reversed(self.comments.items()):
  1882. # if comment.used == ' ':
  1883. # return True
  1884. # return False
  1885. def unprocessed(self, use: Any = False) -> Any:
  1886. while len(self.unused) > 0:
  1887. if _debug != 0:
  1888. import inspect
  1889. first = self.unused.pop(0) if use else self.unused[0]
  1890. info = inspect.getframeinfo(inspect.stack()[1][0])
  1891. xprintf('using', first, self.comments[first].value, info.function, info.lineno)
  1892. yield first, self.comments[first]
  1893. if use:
  1894. self.comments[first].set_used()
  1895. def assign_pre(self, token: Any) -> Any:
  1896. token_line = token.start_mark.line
  1897. if _debug != 0:
  1898. import inspect
  1899. info = inspect.getframeinfo(inspect.stack()[1][0])
  1900. xprintf('assign_pre', token_line, self.unused, info.function, info.lineno)
  1901. gobbled = False
  1902. while self.unused and self.unused[0] < token_line:
  1903. gobbled = True
  1904. first = self.unused.pop(0)
  1905. if _debug != 0:
  1906. xprintf('assign_pre < ', first)
  1907. self.comments[first].set_used()
  1908. token.add_comment_pre(first)
  1909. return gobbled
  1910. def assign_eol(self, tokens: Any) -> Any:
  1911. try:
  1912. comment_line = self.unused[0]
  1913. except IndexError:
  1914. return
  1915. if not isinstance(self.comments[comment_line], EOLComment):
  1916. return
  1917. idx = 1
  1918. while tokens[-idx].start_mark.line > comment_line or isinstance(
  1919. tokens[-idx], tokens.ValueToken,
  1920. ):
  1921. idx += 1
  1922. if _debug != 0:
  1923. xprintf('idx1', idx)
  1924. if (
  1925. len(tokens) > idx
  1926. and isinstance(tokens[-idx], tokens.ScalarToken)
  1927. and isinstance(tokens[-(idx + 1)], tokens.ScalarToken)
  1928. ):
  1929. return
  1930. try:
  1931. if isinstance(tokens[-idx], tokens.ScalarToken) and isinstance(
  1932. tokens[-(idx + 1)], tokens.KeyToken,
  1933. ):
  1934. try:
  1935. eol_idx = self.unused.pop(0)
  1936. self.comments[eol_idx].set_used()
  1937. if _debug != 0:
  1938. xprintf('>>>>>a', idx, eol_idx, KEYCMNT)
  1939. tokens[-idx].add_comment_eol(eol_idx, KEYCMNT)
  1940. except IndexError:
  1941. raise NotImplementedError
  1942. return
  1943. except IndexError:
  1944. if _debug != 0:
  1945. xprintf('IndexError1')
  1946. pass
  1947. try:
  1948. if isinstance(tokens[-idx], tokens.ScalarToken) and isinstance(
  1949. tokens[-(idx + 1)], (tokens.ValueToken, tokens.BlockEntryToken),
  1950. ):
  1951. try:
  1952. eol_idx = self.unused.pop(0)
  1953. self.comments[eol_idx].set_used()
  1954. tokens[-idx].add_comment_eol(eol_idx, VALUECMNT)
  1955. except IndexError:
  1956. raise NotImplementedError
  1957. return
  1958. except IndexError:
  1959. if _debug != 0:
  1960. xprintf('IndexError2')
  1961. pass
  1962. for t in tokens:
  1963. xprintf('tt-', t)
  1964. if _debug != 0:
  1965. xprintf('not implemented EOL', type(tokens[-idx]))
  1966. import sys
  1967. sys.exit(0)
  1968. def assign_post(self, token: Any) -> Any:
  1969. token_line = token.start_mark.line
  1970. if _debug != 0:
  1971. import inspect
  1972. info = inspect.getframeinfo(inspect.stack()[1][0])
  1973. xprintf('assign_post', token_line, self.unused, info.function, info.lineno)
  1974. gobbled = False
  1975. while self.unused and self.unused[0] < token_line:
  1976. gobbled = True
  1977. first = self.unused.pop(0)
  1978. if _debug != 0:
  1979. xprintf('assign_post < ', first)
  1980. self.comments[first].set_used()
  1981. token.add_comment_post(first)
  1982. return gobbled
  1983. def str_unprocessed(self) -> Any:
  1984. return ''.join(
  1985. (f' {ind:2} {x.info()}\n' for ind, x in self.comments.items() if x.used == ' '),
  1986. )
  1987. class RoundTripScannerSC(Scanner): # RoundTripScanner Split Comments
  1988. def __init__(self, *arg: Any, **kw: Any) -> None:
  1989. super().__init__(*arg, **kw)
  1990. assert self.loader is not None
  1991. # comments isinitialised on .need_more_tokens and persist on
  1992. # self.loader.parsed_comments
  1993. self.comments = None
  1994. def get_token(self) -> Any:
  1995. # Return the next token.
  1996. while self.need_more_tokens():
  1997. self.fetch_more_tokens()
  1998. if len(self.tokens) > 0:
  1999. if isinstance(self.tokens[0], tokens.BlockEndToken):
  2000. self.comments.assign_post(self.tokens[0]) # type: ignore
  2001. else:
  2002. self.comments.assign_pre(self.tokens[0]) # type: ignore
  2003. self.tokens_taken += 1
  2004. return self.tokens.pop(0)
  2005. def need_more_tokens(self) -> bool:
  2006. if self.comments is None:
  2007. self.loader.parsed_comments = self.comments = ScannedComments() # type: ignore
  2008. if self.done:
  2009. return False
  2010. if len(self.tokens) == 0:
  2011. return True
  2012. # The current token may be a potential simple key, so we
  2013. # need to look further.
  2014. self.stale_possible_simple_keys()
  2015. if self.next_possible_simple_key() == self.tokens_taken:
  2016. return True
  2017. if len(self.tokens) < 2:
  2018. return True
  2019. if self.tokens[0].start_mark.line == self.tokens[-1].start_mark.line:
  2020. return True
  2021. if True:
  2022. if _debug != 0:
  2023. xprintf('-x--', len(self.tokens))
  2024. for t in self.tokens:
  2025. xprintf(t)
  2026. # xprintf(self.comments.last())
  2027. xprintf(self.comments.str_unprocessed()) # type: ignore
  2028. self.comments.assign_pre(self.tokens[0]) # type: ignore
  2029. self.comments.assign_eol(self.tokens) # type: ignore
  2030. return False
def scan_to_next_token(self) -> None:
    """Skip whitespace, line breaks and comments up to the next token.

    Unlike the base Scanner, every comment / blank line encountered is
    registered with ``self.comments`` (ScannedComments) instead of being
    returned for conversion into CommentTokens.
    """
    srp = self.reader.peek
    srf = self.reader.forward
    # strip a byte-order mark at the very start of the stream
    if self.reader.index == 0 and srp() == '\uFEFF':
        srf()
    start_mark = self.reader.get_mark()
    # xprintf('current_mark', start_mark.line, start_mark.column)
    found = False
    while not found:
        while srp() == ' ':
            srf()
        ch = srp()
        if ch == '#':
            # collect the comment text up to end of line / end of stream
            comment_start_mark = self.reader.get_mark()
            comment = ch
            srf()  # skipt the '#'
            while ch not in _THE_END:
                ch = srp()
                if ch == '\0':  # don't gobble the end-of-stream character
                    # but add an explicit newline as "YAML processors should terminate
                    # the stream with an explicit line break
                    # https://yaml.org/spec/1.2/spec.html#id2780069
                    comment += '\n'
                    break
                comment += ch
                srf()
            # we have a comment
            if start_mark.column == 0:
                self.comments.add_full_line_comment(  # type: ignore
                    comment, comment_start_mark.column, comment_start_mark.line,
                )
            else:
                self.comments.add_eol_comment(  # type: ignore
                    comment, comment_start_mark.column, comment_start_mark.line,
                )
                comment = ""
            # gather any blank lines or full line comments following the comment as well
            self.scan_empty_or_full_line_comments()
            if not self.flow_level:
                self.allow_simple_key = True
            return
        if bool(self.scan_line_break()):
            # start_mark = self.reader.get_mark()
            if not self.flow_level:
                self.allow_simple_key = True
            self.scan_empty_or_full_line_comments()
            return None
            # NOTE(review): everything below in this branch is unreachable
            # (dead code retained from RoundTripScanner.scan_to_next_token)
            ch = srp()
            if ch == '\n':  # empty toplevel lines
                start_mark = self.reader.get_mark()
                comment = ""
                while ch:
                    ch = self.scan_line_break(empty_line=True)
                    comment += ch
                if srp() == '#':
                    # empty line followed by indented real comment
                    comment = comment.rsplit('\n', 1)[0] + '\n'
                _ = self.reader.get_mark()  # gobble end_mark
                return None
        else:
            found = True
    return None
def scan_empty_or_full_line_comments(self) -> None:
    """Consume consecutive blank lines and full-line comments.

    Each completed blank-line run is registered via add_blank_line and each
    full-line comment via add_full_line_comment; stops at the first
    character that belongs to real content.
    """
    blmark = self.reader.get_mark()
    assert blmark.column == 0
    blanks = ""      # accumulated blank-line whitespace
    comment = None   # comment text being collected, or None
    mark = None      # mark of the current comment's '#'
    ch = self.reader.peek()
    while True:
        # nprint('ch', repr(ch), self.reader.get_mark().column)
        if ch in '\r\n\x85\u2028\u2029':
            # line break terminates whatever is being collected
            if self.reader.prefix(2) == '\r\n':
                self.reader.forward(2)
            else:
                self.reader.forward()
            if comment is not None:
                comment += '\n'
                self.comments.add_full_line_comment(comment, mark.column, mark.line)
                comment = None
            else:
                blanks += '\n'
                self.comments.add_blank_line(blanks, blmark.column, blmark.line)  # type: ignore # NOQA
                blanks = ""
            blmark = self.reader.get_mark()
            ch = self.reader.peek()
            continue
        if comment is None:
            if ch in ' \t':
                blanks += ch
            elif ch == '#':
                # start of a full-line comment
                mark = self.reader.get_mark()
                comment = '#'
            else:
                # real content reached
                # xprintf('breaking on', repr(ch))
                break
        else:
            comment += ch
        self.reader.forward()
        ch = self.reader.peek()
  2131. def scan_block_scalar_ignored_line(self, start_mark: Any) -> Any:
  2132. # See the specification for details.
  2133. srp = self.reader.peek
  2134. srf = self.reader.forward
  2135. prefix = ''
  2136. comment = None
  2137. while srp() == ' ':
  2138. prefix += srp()
  2139. srf()
  2140. if srp() == '#':
  2141. comment = ''
  2142. mark = self.reader.get_mark()
  2143. while srp() not in _THE_END:
  2144. comment += srp()
  2145. srf()
  2146. comment += '\n' # type: ignore
  2147. ch = srp()
  2148. if ch not in _THE_END:
  2149. raise ScannerError(
  2150. 'while scanning a block scalar',
  2151. start_mark,
  2152. f'expected a comment or a line break, but found {ch!r}',
  2153. self.reader.get_mark(),
  2154. )
  2155. if comment is not None:
  2156. self.comments.add_eol_comment(comment, mark.column, mark.line) # type: ignore
  2157. self.scan_line_break()
  2158. return None