Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.
 
 
 
 

485 lignes
14 KiB

  1. #------------------------------------------------------------------------------
  2. # pycparser: c_lexer.py
  3. #
  4. # CLexer class: lexer for the C language
  5. #
  6. # Eli Bendersky [https://eli.thegreenplace.net/]
  7. # License: BSD
  8. #------------------------------------------------------------------------------
  9. import re
  10. import sys
  11. from .ply import lex
  12. from .ply.lex import TOKEN
class CLexer(object):
    """ A lexer for the C language. After building it, set the
        input text with input(), and call token() to get new
        tokens.

        The public attribute filename can be set to an initial
        filename, but the lexer will update it upon #line
        directives.

        NOTE: PLY introspects this class. Token-rule *methods* are tried in
        definition order, their docstrings (or @TOKEN regexes) are the token
        patterns, and ``t_*`` string attributes are ordered by regex length.
        Do not rename, reorder, or edit the regex strings casually.
    """
    def __init__(self, error_func, on_lbrace_func, on_rbrace_func,
                 type_lookup_func):
        """ Create a new Lexer.

            error_func:
                An error function. Will be called with an error
                message, line and column as arguments, in case of
                an error during lexing.

            on_lbrace_func, on_rbrace_func:
                Called when an LBRACE or RBRACE is encountered
                (likely to push/pop type_lookup_func's scope)

            type_lookup_func:
                A type lookup function. Given a string, it must
                return True IFF this string is a name of a type
                that was defined with a typedef earlier.
        """
        self.error_func = error_func
        self.on_lbrace_func = on_lbrace_func
        self.on_rbrace_func = on_rbrace_func
        self.type_lookup_func = type_lookup_func
        self.filename = ''

        # Keeps track of the last token returned from self.token()
        self.last_token = None

        # Allow either "# line" or "# <num>" to support GCC's
        # cpp output
        #
        self.line_pattern = re.compile(r'([ \t]*line\W)|([ \t]*\d+)')
        self.pragma_pattern = re.compile(r'[ \t]*pragma\W')

    def build(self, **kwargs):
        """ Builds the lexer from the specification. Must be
            called after the lexer object is created.

            This method exists separately, because the PLY
            manual warns against calling lex.lex inside
            __init__
        """
        self.lexer = lex.lex(object=self, **kwargs)

    def reset_lineno(self):
        """ Resets the internal line number counter of the lexer.
        """
        self.lexer.lineno = 1

    def input(self, text):
        # Feed a new input string to the underlying PLY lexer.
        self.lexer.input(text)

    def token(self):
        # Return the next token, remembering it in self.last_token so the
        # parser can consult the previously returned token for context.
        self.last_token = self.lexer.token()
        return self.last_token

    def find_tok_column(self, token):
        """ Find the column of the token in its line.
        """
        # Column is the token's offset from the newline that precedes it
        # (rfind returns -1 when the token is on the first line, which
        # conveniently yields a 1-based column).
        last_cr = self.lexer.lexdata.rfind('\n', 0, token.lexpos)
        return token.lexpos - last_cr

    ######################--   PRIVATE   --######################

    ##
    ## Internal auxiliary methods
    ##
    def _error(self, msg, token):
        # Report the error through the user-supplied callback, then skip a
        # single character so lexing can resume past the offending input.
        location = self._make_tok_location(token)
        self.error_func(msg, location[0], location[1])
        self.lexer.skip(1)

    def _make_tok_location(self, token):
        # (line, column) pair used by _error for diagnostics.
        return (token.lineno, self.find_tok_column(token))

    ##
    ## Reserved keywords
    ##
    keywords = (
        '_BOOL', '_COMPLEX', 'AUTO', 'BREAK', 'CASE', 'CHAR', 'CONST',
        'CONTINUE', 'DEFAULT', 'DO', 'DOUBLE', 'ELSE', 'ENUM', 'EXTERN',
        'FLOAT', 'FOR', 'GOTO', 'IF', 'INLINE', 'INT', 'LONG',
        'REGISTER', 'OFFSETOF',
        'RESTRICT', 'RETURN', 'SHORT', 'SIGNED', 'SIZEOF', 'STATIC', 'STRUCT',
        'SWITCH', 'TYPEDEF', 'UNION', 'UNSIGNED', 'VOID',
        'VOLATILE', 'WHILE', '__INT128',
    )

    # Map from a keyword's source spelling to its token name. Most keywords
    # are just the lowercase of the token name; the C99 underscore-prefixed
    # keywords (_Bool, _Complex) keep their mixed-case spelling.
    keyword_map = {}
    for keyword in keywords:
        if keyword == '_BOOL':
            keyword_map['_Bool'] = keyword
        elif keyword == '_COMPLEX':
            keyword_map['_Complex'] = keyword
        else:
            keyword_map[keyword.lower()] = keyword

    ##
    ## All the tokens recognized by the lexer
    ##
    tokens = keywords + (
        # Identifiers
        'ID',

        # Type identifiers (identifiers previously defined as
        # types with typedef)
        'TYPEID',

        # constants
        'INT_CONST_DEC', 'INT_CONST_OCT', 'INT_CONST_HEX', 'INT_CONST_BIN',
        'FLOAT_CONST', 'HEX_FLOAT_CONST',
        'CHAR_CONST',
        'WCHAR_CONST',

        # String literals
        'STRING_LITERAL',
        'WSTRING_LITERAL',

        # Operators
        'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'MOD',
        'OR', 'AND', 'NOT', 'XOR', 'LSHIFT', 'RSHIFT',
        'LOR', 'LAND', 'LNOT',
        'LT', 'LE', 'GT', 'GE', 'EQ', 'NE',

        # Assignment
        'EQUALS', 'TIMESEQUAL', 'DIVEQUAL', 'MODEQUAL',
        'PLUSEQUAL', 'MINUSEQUAL',
        'LSHIFTEQUAL','RSHIFTEQUAL', 'ANDEQUAL', 'XOREQUAL',
        'OREQUAL',

        # Increment/decrement
        'PLUSPLUS', 'MINUSMINUS',

        # Structure dereference (->)
        'ARROW',

        # Conditional operator (?)
        'CONDOP',

        # Delimiters
        'LPAREN', 'RPAREN',         # ( )
        'LBRACKET', 'RBRACKET',     # [ ]
        'LBRACE', 'RBRACE',         # { }
        'COMMA', 'PERIOD',          # . ,
        'SEMI', 'COLON',            # ; :

        # Ellipsis (...)
        'ELLIPSIS',

        # pre-processor
        'PPHASH',       # '#'
        'PPPRAGMA',     # 'pragma'
        'PPPRAGMASTR',
    )

    ##
    ## Regexes for use in tokens
    ##
    ##

    # valid C identifiers (K&R2: A.2.3), plus '$' (supported by some compilers)
    identifier = r'[a-zA-Z_$][0-9a-zA-Z_$]*'

    hex_prefix = '0[xX]'
    hex_digits = '[0-9a-fA-F]+'
    bin_prefix = '0[bB]'
    bin_digits = '[01]+'

    # integer constants (K&R2: A.2.5.1)
    integer_suffix_opt = r'(([uU]ll)|([uU]LL)|(ll[uU]?)|(LL[uU]?)|([uU][lL])|([lL][uU]?)|[uU])?'
    decimal_constant = '(0'+integer_suffix_opt+')|([1-9][0-9]*'+integer_suffix_opt+')'
    octal_constant = '0[0-7]*'+integer_suffix_opt
    hex_constant = hex_prefix+hex_digits+integer_suffix_opt
    bin_constant = bin_prefix+bin_digits+integer_suffix_opt

    # Octal-looking constants that contain the digits 8 or 9 are invalid C;
    # matched separately so a clear error can be reported.
    bad_octal_constant = '0[0-7]*[89]'

    # character constants (K&R2: A.2.5.2)
    # Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line
    # directives with Windows paths as filenames (..\..\dir\file)
    # For the same reason, decimal_escape allows all digit sequences. We want to
    # parse all correct code, even if it means to sometimes parse incorrect
    # code.
    #
    simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
    decimal_escape = r"""(\d+)"""
    hex_escape = r"""(x[0-9a-fA-F]+)"""
    bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""

    escape_sequence = r"""(\\("""+simple_escape+'|'+decimal_escape+'|'+hex_escape+'))'
    cconst_char = r"""([^'\\\n]|"""+escape_sequence+')'
    char_const = "'"+cconst_char+"'"
    wchar_const = 'L'+char_const
    unmatched_quote = "('"+cconst_char+"*\\n)|('"+cconst_char+"*$)"
    bad_char_const = r"""('"""+cconst_char+"""[^'\n]+')|('')|('"""+bad_escape+r"""[^'\n]*')"""

    # string literals (K&R2: A.2.6)
    string_char = r"""([^"\\\n]|"""+escape_sequence+')'
    string_literal = '"'+string_char+'*"'
    wstring_literal = 'L'+string_literal
    bad_string_literal = '"'+string_char+'*?'+bad_escape+string_char+'*"'

    # floating constants (K&R2: A.2.5.3)
    exponent_part = r"""([eE][-+]?[0-9]+)"""
    fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
    floating_constant = '(((('+fractional_constant+')'+exponent_part+'?)|([0-9]+'+exponent_part+'))[FfLl]?)'
    binary_exponent_part = r'''([pP][+-]?[0-9]+)'''
    hex_fractional_constant = '((('+hex_digits+r""")?\."""+hex_digits+')|('+hex_digits+r"""\.))"""
    hex_floating_constant = '('+hex_prefix+'('+hex_digits+'|'+hex_fractional_constant+')'+binary_exponent_part+'[FfLl]?)'

    ##
    ## Lexer states: used for preprocessor \n-terminated directives
    ##
    states = (
        # ppline: preprocessor line directives
        #
        ('ppline', 'exclusive'),

        # pppragma: pragma
        #
        ('pppragma', 'exclusive'),
    )

    # A '#' starts a preprocessor leftover. Peek at what follows in the raw
    # input: '# line'/'# <num>' switches to the ppline state, '# pragma' to
    # pppragma; anything else is emitted as a bare PPHASH token. Note that in
    # the first two cases no token is returned (the method falls off the end).
    def t_PPHASH(self, t):
        r'[ \t]*\#'
        if self.line_pattern.match(t.lexer.lexdata, pos=t.lexer.lexpos):
            t.lexer.begin('ppline')
            self.pp_line = self.pp_filename = None
        elif self.pragma_pattern.match(t.lexer.lexdata, pos=t.lexer.lexpos):
            t.lexer.begin('pppragma')
        else:
            t.type = 'PPHASH'
            return t

    ##
    ## Rules for the ppline state
    ##

    # Filename in a #line directive; only valid after the line number.
    @TOKEN(string_literal)
    def t_ppline_FILENAME(self, t):
        if self.pp_line is None:
            self._error('filename before line number in #line', t)
        else:
            self.pp_filename = t.value.lstrip('"').rstrip('"')

    # Line number in a #line directive (first number wins).
    @TOKEN(decimal_constant)
    def t_ppline_LINE_NUMBER(self, t):
        if self.pp_line is None:
            self.pp_line = t.value
        else:
            # Ignore: GCC's cpp sometimes inserts a numeric flag
            # after the file name
            pass

    # End of the directive: apply the collected line/filename and return
    # to the normal lexing state.
    def t_ppline_NEWLINE(self, t):
        r'\n'
        if self.pp_line is None:
            self._error('line number missing in #line', t)
        else:
            self.lexer.lineno = int(self.pp_line)

            if self.pp_filename is not None:
                self.filename = self.pp_filename

        t.lexer.begin('INITIAL')

    # The literal word 'line' in '#line' is consumed and discarded.
    def t_ppline_PPLINE(self, t):
        r'line'
        pass

    t_ppline_ignore = ' \t'

    def t_ppline_error(self, t):
        self._error('invalid #line directive', t)

    ##
    ## Rules for the pppragma state
    ##

    # Pragmas are \n-terminated; the newline ends the directive.
    def t_pppragma_NEWLINE(self, t):
        r'\n'
        t.lexer.lineno += 1
        t.lexer.begin('INITIAL')

    def t_pppragma_PPPRAGMA(self, t):
        r'pragma'
        return t

    t_pppragma_ignore = ' \t'

    # Everything after 'pragma' up to the newline is one opaque string token.
    def t_pppragma_STR(self, t):
        '.+'
        t.type = 'PPPRAGMASTR'
        return t

    def t_pppragma_error(self, t):
        self._error('invalid #pragma directive', t)

    ##
    ## Rules for the normal state
    ##
    t_ignore = ' \t'

    # Newlines
    def t_NEWLINE(self, t):
        r'\n+'
        # Track line numbers; returns nothing, so newlines produce no token.
        t.lexer.lineno += t.value.count("\n")

    # Operators
    t_PLUS              = r'\+'
    t_MINUS             = r'-'
    t_TIMES             = r'\*'
    t_DIVIDE            = r'/'
    t_MOD               = r'%'
    t_OR                = r'\|'
    t_AND               = r'&'
    t_NOT               = r'~'
    t_XOR               = r'\^'
    t_LSHIFT            = r'<<'
    t_RSHIFT            = r'>>'
    t_LOR               = r'\|\|'
    t_LAND              = r'&&'
    t_LNOT              = r'!'
    t_LT                = r'<'
    t_GT                = r'>'
    t_LE                = r'<='
    t_GE                = r'>='
    t_EQ                = r'=='
    t_NE                = r'!='

    # Assignment operators
    t_EQUALS            = r'='
    t_TIMESEQUAL        = r'\*='
    t_DIVEQUAL          = r'/='
    t_MODEQUAL          = r'%='
    t_PLUSEQUAL         = r'\+='
    t_MINUSEQUAL        = r'-='
    t_LSHIFTEQUAL       = r'<<='
    t_RSHIFTEQUAL       = r'>>='
    t_ANDEQUAL          = r'&='
    t_OREQUAL           = r'\|='
    t_XOREQUAL          = r'\^='

    # Increment/decrement
    t_PLUSPLUS          = r'\+\+'
    t_MINUSMINUS        = r'--'

    # ->
    t_ARROW             = r'->'

    # ?
    t_CONDOP            = r'\?'

    # Delimiters
    t_LPAREN            = r'\('
    t_RPAREN            = r'\)'
    t_LBRACKET          = r'\['
    t_RBRACKET          = r'\]'
    t_COMMA             = r','
    t_PERIOD            = r'\.'
    t_SEMI              = r';'
    t_COLON             = r':'
    t_ELLIPSIS          = r'\.\.\.'

    # Scope delimiters
    # To see why on_lbrace_func is needed, consider:
    #   typedef char TT;
    #   void foo(int TT) { TT = 10; }
    #   TT x = 5;
    # Outside the function, TT is a typedef, but inside (starting and ending
    # with the braces) it's a parameter. The trouble begins with yacc's
    # lookahead token. If we open a new scope in brace_open, then TT has
    # already been read and incorrectly interpreted as TYPEID. So, we need
    # to open and close scopes from within the lexer.
    # Similar for the TT immediately outside the end of the function.
    #
    @TOKEN(r'\{')
    def t_LBRACE(self, t):
        self.on_lbrace_func()
        return t
    @TOKEN(r'\}')
    def t_RBRACE(self, t):
        self.on_rbrace_func()
        return t

    t_STRING_LITERAL = string_literal

    # The following floating and integer constants are defined as
    # functions to impose a strict order (otherwise, decimal
    # is placed before the others because its regex is longer,
    # and this is bad)
    #
    @TOKEN(floating_constant)
    def t_FLOAT_CONST(self, t):
        return t

    @TOKEN(hex_floating_constant)
    def t_HEX_FLOAT_CONST(self, t):
        return t

    @TOKEN(hex_constant)
    def t_INT_CONST_HEX(self, t):
        return t

    @TOKEN(bin_constant)
    def t_INT_CONST_BIN(self, t):
        return t

    @TOKEN(bad_octal_constant)
    def t_BAD_CONST_OCT(self, t):
        msg = "Invalid octal constant"
        self._error(msg, t)

    @TOKEN(octal_constant)
    def t_INT_CONST_OCT(self, t):
        return t

    @TOKEN(decimal_constant)
    def t_INT_CONST_DEC(self, t):
        return t

    # Must come before bad_char_const, to prevent it from
    # catching valid char constants as invalid
    #
    @TOKEN(char_const)
    def t_CHAR_CONST(self, t):
        return t

    @TOKEN(wchar_const)
    def t_WCHAR_CONST(self, t):
        return t

    @TOKEN(unmatched_quote)
    def t_UNMATCHED_QUOTE(self, t):
        msg = "Unmatched '"
        self._error(msg, t)

    @TOKEN(bad_char_const)
    def t_BAD_CHAR_CONST(self, t):
        msg = "Invalid char constant %s" % t.value
        self._error(msg, t)

    @TOKEN(wstring_literal)
    def t_WSTRING_LITERAL(self, t):
        return t

    # unmatched string literals are caught by the preprocessor

    @TOKEN(bad_string_literal)
    def t_BAD_STRING_LITERAL(self, t):
        msg = "String contains invalid escape code"
        self._error(msg, t)

    # Identifiers are classified here: reserved words become keyword tokens,
    # names registered as typedefs (via type_lookup_func) become TYPEID, and
    # everything else is a plain ID.
    @TOKEN(identifier)
    def t_ID(self, t):
        t.type = self.keyword_map.get(t.value, "ID")
        if t.type == 'ID' and self.type_lookup_func(t.value):
            t.type = "TYPEID"
        return t

    def t_error(self, t):
        msg = 'Illegal character %s' % repr(t.value[0])
        self._error(msg, t)