  1. """
  2. pygments.lexers.sql
  3. ~~~~~~~~~~~~~~~~~~~
  4. Lexers for various SQL dialects and related interactive sessions.
  5. Postgres specific lexers:
  6. `PostgresLexer`
  7. A SQL lexer for the PostgreSQL dialect. Differences w.r.t. the SQL
  8. lexer are:
  9. - keywords and data types list parsed from the PG docs (run the
  10. `_postgres_builtins` module to update them);
  11. - Content of $-strings parsed using a specific lexer, e.g. the content
  12. of a PL/Python function is parsed using the Python lexer;
  13. - parse PG specific constructs: E-strings, $-strings, U&-strings,
  14. different operators and punctuation.
  15. `PlPgsqlLexer`
  16. A lexer for the PL/pgSQL language. Adds a few specific construct on
  17. top of the PG SQL lexer (such as <<label>>).
  18. `PostgresConsoleLexer`
  19. A lexer to highlight an interactive psql session:
  20. - identifies the prompt and does its best to detect the end of command
  21. in multiline statement where not all the lines are prefixed by a
  22. prompt, telling them apart from the output;
  23. - highlights errors in the output and notification levels;
  24. - handles psql backslash commands.
  25. `PostgresExplainLexer`
  26. A lexer to highlight Postgres execution plan.
  27. The ``tests/examplefiles`` contains a few test files with data to be
  28. parsed by these lexers.
  29. :copyright: Copyright 2006-2025 by the Pygments team, see AUTHORS.
  30. :license: BSD, see LICENSE for details.
  31. """

import collections
import re

from pygments.lexer import Lexer, RegexLexer, do_insertions, bygroups, words
from pygments.lexers import _googlesql_builtins
from pygments.lexers import _mysql_builtins
from pygments.lexers import _postgres_builtins
from pygments.lexers import _sql_builtins
from pygments.lexers import _tsql_builtins
from pygments.lexers import get_lexer_by_name, ClassNotFound
from pygments.token import Punctuation, Whitespace, Text, Comment, Operator, \
    Keyword, Name, String, Number, Generic, Literal

__all__ = ['GoogleSqlLexer', 'PostgresLexer', 'PlPgsqlLexer',
           'PostgresConsoleLexer', 'PostgresExplainLexer', 'SqlLexer',
           'TransactSqlLexer', 'MySqlLexer', 'SqliteConsoleLexer', 'RqlLexer']
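
# A minimal usage sketch (illustrative, not part of the module): these lexers
# are normally reached through the public Pygments API, usually by alias.
# The SQL snippet and the choice of formatter below are arbitrary examples.
#
#     from pygments import highlight
#     from pygments.formatters import TerminalFormatter
#     from pygments.lexers import get_lexer_by_name
#
#     lexer = get_lexer_by_name('postgresql')
#     print(highlight("SELECT 'x', $$body$$;", lexer, TerminalFormatter()))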

line_re = re.compile('.*?\n')
sqlite_prompt_re = re.compile(r'^(?:sqlite| ...)>(?= )')

language_re = re.compile(r"\s+LANGUAGE\s+'?(\w+)'?", re.IGNORECASE)
do_re = re.compile(r'\bDO\b', re.IGNORECASE)

# Regular expressions for analyse_text()
name_between_bracket_re = re.compile(r'\[[a-zA-Z_]\w*\]')
name_between_backtick_re = re.compile(r'`[a-zA-Z_]\w*`')
tsql_go_re = re.compile(r'\bgo\b', re.IGNORECASE)
tsql_declare_re = re.compile(r'\bdeclare\s+@', re.IGNORECASE)
tsql_variable_re = re.compile(r'@[a-zA-Z_]\w*\b')

# Identifiers for analyse_text()
googlesql_identifiers = (
    _googlesql_builtins.functionnames
    + _googlesql_builtins.keywords
    + _googlesql_builtins.types)


def language_callback(lexer, match):
    """Parse the content of a $-string using a lexer.

    The lexer is chosen by looking for a nearby LANGUAGE clause, or assumed
    to be plpgsql if inside a DO statement and no LANGUAGE has been found.
    """
    lx = None
    m = language_re.match(lexer.text[match.end():match.end()+100])
    if m is not None:
        lx = lexer._get_lexer(m.group(1))
    else:
        m = list(language_re.finditer(
            lexer.text[max(0, match.start()-100):match.start()]))
        if m:
            lx = lexer._get_lexer(m[-1].group(1))
        else:
            m = list(do_re.finditer(
                lexer.text[max(0, match.start()-25):match.start()]))
            if m:
                lx = lexer._get_lexer('plpgsql')

    # 1 = $, 2 = delimiter, 3 = $
    yield (match.start(1), String, match.group(1))
    yield (match.start(2), String.Delimiter, match.group(2))
    yield (match.start(3), String, match.group(3))
    # 4 = string contents
    if lx:
        yield from lx.get_tokens_unprocessed(match.group(4))
    else:
        yield (match.start(4), String, match.group(4))
    # 5 = $, 6 = delimiter, 7 = $
    yield (match.start(5), String, match.group(5))
    yield (match.start(6), String.Delimiter, match.group(6))
    yield (match.start(7), String, match.group(7))
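
# For instance, in a function definition such as the following sketch (the
# function name and $body$ delimiter are arbitrary):
#
#     CREATE FUNCTION inc(i integer) RETURNS integer AS $body$
#     return i + 1
#     $body$ LANGUAGE plpython3u;
#
# the trailing LANGUAGE clause falls within the 100 characters scanned after
# the $-string, so the content between the $body$ delimiters is handed to
# the Python lexer.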


class PostgresBase:
    """Base class for Postgres-related lexers.

    This is implemented as a mixin to avoid the Lexer metaclass kicking in.
    This way the different lexers don't have a common Lexer ancestor. If they
    had, _tokens could be created on this ancestor and not updated for the
    other classes, resulting e.g. in PL/pgSQL parsed as SQL. This shortcoming
    seems to suggest that regexp lexers are not really subclassable.
    """
    def get_tokens_unprocessed(self, text, *args):
        # Have a copy of the entire text to be used by `language_callback`.
        self.text = text
        yield from super().get_tokens_unprocessed(text, *args)

    def _get_lexer(self, lang):
        if lang.lower() == 'sql':
            return get_lexer_by_name('postgresql', **self.options)

        tries = [lang]
        if lang.startswith('pl'):
            tries.append(lang[2:])
        if lang.endswith('u'):
            tries.append(lang[:-1])
        if lang.startswith('pl') and lang.endswith('u'):
            tries.append(lang[2:-1])

        for lx in tries:
            try:
                return get_lexer_by_name(lx, **self.options)
            except ClassNotFound:
                pass
        else:
            # TODO: better logging
            # print >>sys.stderr, "language not found:", lang
            return None
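
# A worked example of the fallback above (illustrative): for lang='plpython3u'
# the candidate names tried, in order, are 'plpython3u', 'python3u',
# 'plpython3' and 'python3'; the last one resolves to the stock Python lexer.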


class PostgresLexer(PostgresBase, RegexLexer):
    """
    Lexer for the PostgreSQL dialect of SQL.
    """

    name = 'PostgreSQL SQL dialect'
    aliases = ['postgresql', 'postgres']
    mimetypes = ['text/x-postgresql']
    url = 'https://www.postgresql.org'
    version_added = '1.5'

    flags = re.IGNORECASE
    tokens = {
        'root': [
            (r'\s+', Whitespace),
            (r'--.*\n?', Comment.Single),
            (r'/\*', Comment.Multiline, 'multiline-comments'),
            (r'(' + '|'.join(s.replace(" ", r"\s+")
                             for s in _postgres_builtins.DATATYPES +
                             _postgres_builtins.PSEUDO_TYPES) + r')\b',
             Name.Builtin),
            (words(_postgres_builtins.KEYWORDS, suffix=r'\b'), Keyword),
            (r'[+*/<>=~!@#%^&|`?-]+', Operator),
            (r'::', Operator),  # cast
            (r'\$\d+', Name.Variable),
            (r'([0-9]*\.[0-9]*|[0-9]+)(e[+-]?[0-9]+)?', Number.Float),
            (r'[0-9]+', Number.Integer),
            (r"((?:E|U&)?)(')", bygroups(String.Affix, String.Single), 'string'),
            # quoted identifier
            (r'((?:U&)?)(")', bygroups(String.Affix, String.Name), 'quoted-ident'),
            (r'(?s)(\$)([^$]*)(\$)(.*?)(\$)(\2)(\$)', language_callback),
            (r'[a-z_]\w*', Name),

            # psql variable in SQL
            (r""":(['"]?)[a-z]\w*\b\1""", Name.Variable),

            (r'[;:()\[\]{},.]', Punctuation),
        ],
        'multiline-comments': [
            (r'/\*', Comment.Multiline, 'multiline-comments'),
            (r'\*/', Comment.Multiline, '#pop'),
            (r'[^/*]+', Comment.Multiline),
            (r'[/*]', Comment.Multiline)
        ],
        'string': [
            (r"[^']+", String.Single),
            (r"''", String.Single),
            (r"'", String.Single, '#pop'),
        ],
        'quoted-ident': [
            (r'[^"]+', String.Name),
            (r'""', String.Name),
            (r'"', String.Name, '#pop'),
        ],
    }


class PlPgsqlLexer(PostgresBase, RegexLexer):
    """
    Handle the extra syntax in the PL/pgSQL language.
    """
    name = 'PL/pgSQL'
    aliases = ['plpgsql']
    mimetypes = ['text/x-plpgsql']
    url = 'https://www.postgresql.org/docs/current/plpgsql.html'
    version_added = '1.5'

    flags = re.IGNORECASE
    # FIXME: use inheritance
    tokens = {name: state[:] for (name, state) in PostgresLexer.tokens.items()}

    # extend the keywords list
    for i, pattern in enumerate(tokens['root']):
        if pattern[1] == Keyword:
            tokens['root'][i] = (
                words(_postgres_builtins.KEYWORDS +
                      _postgres_builtins.PLPGSQL_KEYWORDS, suffix=r'\b'),
                Keyword)
            del i
            break
    else:
        assert 0, "SQL keywords not found"

    # Add specific PL/pgSQL rules (before the SQL ones)
    tokens['root'][:0] = [
        (r'\%[a-z]\w*\b', Name.Builtin),  # actually, a datatype
        (r':=', Operator),
        (r'\<\<[a-z]\w*\>\>', Name.Label),
        (r'\#[a-z]\w*\b', Keyword.Pseudo),  # #variable_conflict
    ]


class PsqlRegexLexer(PostgresBase, RegexLexer):
    """
    Extend the PostgresLexer adding support specific to psql commands.

    This is not a complete psql lexer yet as it lacks prompt support
    and output rendering.
    """

    name = 'PostgreSQL console - regexp based lexer'
    aliases = []    # not public

    flags = re.IGNORECASE
    tokens = {name: state[:] for (name, state) in PostgresLexer.tokens.items()}

    tokens['root'].append(
        (r'\\[^\s]+', Keyword.Pseudo, 'psql-command'))
    tokens['psql-command'] = [
        (r'\n', Text, 'root'),
        (r'\s+', Whitespace),
        (r'\\[^\s]+', Keyword.Pseudo),
        (r""":(['"]?)[a-z]\w*\b\1""", Name.Variable),
        (r"'(''|[^'])*'", String.Single),
        (r"`([^`])*`", String.Backtick),
        (r"[^\s]+", String.Symbol),
    ]


re_prompt = re.compile(r'^(\S.*?)??[=\-\(\$\'\"][#>]')
re_end_command = re.compile(r';\s*(--.*?)?$')
re_psql_command = re.compile(r'(\s*)(\\.+?)(\s+)$')
re_error = re.compile(r'(ERROR|FATAL):')
re_message = re.compile(
    r'((?:DEBUG|INFO|NOTICE|WARNING|ERROR|'
    r'FATAL|HINT|DETAIL|CONTEXT|LINE [0-9]+):)(.*?\n)')


class lookahead:
    """Wrap an iterator and allow pushing back an item."""
    def __init__(self, x):
        self.iter = iter(x)
        self._nextitem = None

    def __iter__(self):
        return self

    def send(self, i):
        self._nextitem = i
        return i

    def __next__(self):
        if self._nextitem is not None:
            ni = self._nextitem
            self._nextitem = None
            return ni
        return next(self.iter)
    next = __next__
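
# Usage sketch (illustrative): consume an item, push it back, read it again.
#
#     lines = lookahead(['a\n', 'b\n'])
#     first = next(lines)   # 'a\n'
#     lines.send(first)     # push it back
#     assert next(lines) == 'a\n'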


class PostgresConsoleLexer(Lexer):
    """
    Lexer for psql sessions.
    """

    name = 'PostgreSQL console (psql)'
    aliases = ['psql', 'postgresql-console', 'postgres-console']
    mimetypes = ['text/x-postgresql-psql']
    url = 'https://www.postgresql.org'
    version_added = '1.5'
    _example = "psql/psql_session.txt"

    def get_tokens_unprocessed(self, data):
        sql = PsqlRegexLexer(**self.options)

        lines = lookahead(line_re.findall(data))

        # prompt-output cycle
        while True:

            # consume the lines of the command: start with an optional prompt
            # and continue until the end of command is detected
            curcode = ''
            insertions = []
            for line in lines:
                # Identify a shell prompt in case of psql commandline example
                if line.startswith('$') and not curcode:
                    lexer = get_lexer_by_name('console', **self.options)
                    yield from lexer.get_tokens_unprocessed(line)
                    break

                # Identify a psql prompt
                mprompt = re_prompt.match(line)
                if mprompt is not None:
                    insertions.append((len(curcode),
                                       [(0, Generic.Prompt, mprompt.group())]))
                    curcode += line[len(mprompt.group()):]
                else:
                    curcode += line

                # Check if this is the end of the command
                # TODO: better handle multiline comments at the end with
                # a lexer with an external state?
                if re_psql_command.match(curcode) \
                   or re_end_command.search(curcode):
                    break

            # Emit the combined stream of command and prompt(s)
            yield from do_insertions(insertions,
                                     sql.get_tokens_unprocessed(curcode))

            # Emit the output lines
            out_token = Generic.Output
            for line in lines:
                mprompt = re_prompt.match(line)
                if mprompt is not None:
                    # push the line back to have it processed by the prompt
                    # cycle above
                    lines.send(line)
                    break

                mmsg = re_message.match(line)
                if mmsg is not None:
                    if mmsg.group(1).startswith("ERROR") \
                       or mmsg.group(1).startswith("FATAL"):
                        out_token = Generic.Error
                    yield (mmsg.start(1), Generic.Strong, mmsg.group(1))
                    yield (mmsg.start(2), out_token, mmsg.group(2))
                else:
                    yield (0, out_token, line)
            else:
                return
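
# The sessions this lexer targets look like the following sketch: prompt
# lines introduce the command (continuation lines carry a '-#' prompt or no
# prompt at all), and everything up to the next prompt is output.
#
#     postgres=# SELECT 1 +
#     postgres-#        2;
#      ?column?
#     ----------
#             3
#     (1 row)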


class PostgresExplainLexer(RegexLexer):
    """
    Handle PostgreSQL EXPLAIN output.
    """

    name = 'PostgreSQL EXPLAIN dialect'
    aliases = ['postgres-explain']
    filenames = ['*.explain']
    mimetypes = ['text/x-postgresql-explain']
    url = 'https://www.postgresql.org/docs/current/using-explain.html'
    version_added = '2.15'

    tokens = {
        'root': [
            (r'(:|\(|\)|ms|kB|->|\.\.|\,|\/)', Punctuation),
            (r'(\s+)', Whitespace),

            # These match the estimated cost and the counters actually
            # measured with ANALYZE, then move to the instrumentation state
            (r'(cost)(=?)', bygroups(Name.Class, Punctuation), 'instrumentation'),
            (r'(actual)( )(=?)', bygroups(Name.Class, Whitespace, Punctuation), 'instrumentation'),

            # Misc keywords
            (words(('actual', 'Memory Usage', 'Disk Usage', 'Memory', 'Buckets', 'Batches',
                    'originally', 'row', 'rows', 'Hits', 'Misses',
                    'Evictions', 'Overflows', 'Planned Partitions'), suffix=r'\b'),
             Comment.Single),

            (r'(hit|read|dirtied|written|write|time|calls)(=)', bygroups(Comment.Single, Operator)),
            (r'(shared|temp|local)', Keyword.Pseudo),

            # We move to the sort state in order to emphasize specific
            # keywords (especially disk access)
            (r'(Sort Method)(: )', bygroups(Comment.Preproc, Punctuation), 'sort'),

            # These keywords can be followed by an object, like a table
            (r'(Sort Key|Group Key|Presorted Key|Hash Key)(:)( )',
             bygroups(Comment.Preproc, Punctuation, Whitespace), 'object_name'),
            (r'(Cache Key|Cache Mode)(:)( )', bygroups(Comment, Punctuation, Whitespace), 'object_name'),

            # These keywords can be followed by a predicate
            (words(('Join Filter', 'Subplans Removed', 'Filter', 'Merge Cond',
                    'Hash Cond', 'Index Cond', 'Recheck Cond', 'Heap Blocks',
                    'TID Cond', 'Run Condition', 'Order By', 'Function Call',
                    'Table Function Call', 'Inner Unique', 'Params Evaluated',
                    'Single Copy', 'Sampling', 'One-Time Filter', 'Output',
                    'Relations', 'Remote SQL'), suffix=r'\b'),
             Comment.Preproc, 'predicate'),

            # Special keyword to handle ON CONFLICT
            (r'Conflict ', Comment.Preproc, 'conflict'),

            # Special keyword for InitPlan or SubPlan
            (r'(InitPlan|SubPlan)( )(\d+)( )',
             bygroups(Keyword, Whitespace, Number.Integer, Whitespace),
             'init_plan'),

            (words(('Sort Method', 'Join Filter', 'Planning time',
                    'Planning Time', 'Execution time', 'Execution Time',
                    'Workers Planned', 'Workers Launched', 'Buffers',
                    'Planning', 'Worker', 'Query Identifier', 'Time',
                    'Full-sort Groups', 'Pre-sorted Groups'), suffix=r'\b'), Comment.Preproc),

            # Emphasize these keywords
            (words(('Rows Removed by Join Filter', 'Rows Removed by Filter',
                    'Rows Removed by Index Recheck',
                    'Heap Fetches', 'never executed'),
                   suffix=r'\b'), Name.Exception),
            (r'(I/O Timings)(:)( )', bygroups(Name.Exception, Punctuation, Whitespace)),

            (words(_postgres_builtins.EXPLAIN_KEYWORDS, suffix=r'\b'), Keyword),

            # join keywords
            (r'((Right|Left|Full|Semi|Anti) Join)', Keyword.Type),
            (r'(Parallel |Async |Finalize |Partial )', Comment.Preproc),
            (r'Backward', Comment.Preproc),
            (r'(Intersect|Except|Hash)', Comment.Preproc),

            (r'(CTE)( )(\w*)?', bygroups(Comment, Whitespace, Name.Variable)),

            # Treat "on" and "using" as punctuation
            (r'(on|using)', Punctuation, 'object_name'),

            # strings
            (r"'(''|[^'])*'", String.Single),
            # numbers
            (r'-?\d+\.\d+', Number.Float),
            (r'(-?\d+)', Number.Integer),

            # boolean
            (r'(true|false)', Name.Constant),
            # explain header
            (r'\s*QUERY PLAN\s*\n\s*-+', Comment.Single),
            # Settings
            (r'(Settings)(:)( )', bygroups(Comment.Preproc, Punctuation, Whitespace), 'setting'),

            # Handle JIT counters
            (r'(JIT|Functions|Options|Timing)(:)', bygroups(Comment.Preproc, Punctuation)),
            (r'(Inlining|Optimization|Expressions|Deforming|Generation|Emission|Total)', Keyword.Pseudo),

            # Handle Triggers counters
            (r'(Trigger)( )(\S*)(:)( )',
             bygroups(Comment.Preproc, Whitespace, Name.Variable, Punctuation, Whitespace)),
        ],
        'expression': [
            # matches any kind of parenthesized expression
            # the first opening paren is matched by the 'caller'
            (r'\(', Punctuation, '#push'),
            (r'\)', Punctuation, '#pop'),
            (r'(never executed)', Name.Exception),
            (r'[^)(]+', Comment),
        ],
        'object_name': [
            # This is a cost or analyze measure
            (r'(\(cost)(=?)', bygroups(Name.Class, Punctuation), 'instrumentation'),
            (r'(\(actual)( )(=?)', bygroups(Name.Class, Whitespace, Punctuation), 'instrumentation'),

            # if object_name is parenthesized, mark opening paren as
            # punctuation, call 'expression', and exit state
            (r'\(', Punctuation, 'expression'),
            (r'(on)', Punctuation),
            # matches possibly schema-qualified table and column names
            (r'\w+(\.\w+)*( USING \S+| \w+ USING \S+)', Name.Variable),
            (r'\"?\w+\"?(?:\.\"?\w+\"?)?', Name.Variable),
            (r'\'\S*\'', Name.Variable),

            # if we encounter a comma, another object is listed
            (r',\n', Punctuation, 'object_name'),
            (r',', Punctuation, 'object_name'),

            # special case: "*SELECT*"
            (r'"\*SELECT\*( \d+)?"(.\w+)?', Name.Variable),
            (r'"\*VALUES\*(_\d+)?"(.\w+)?', Name.Variable),
            (r'"ANY_subquery"', Name.Variable),

            # Variable $1 ...
            (r'\$\d+', Name.Variable),
            # cast
            (r'::\w+', Name.Variable),
            (r' +', Whitespace),
            (r'"', Punctuation),
            (r'\[\.\.\.\]', Punctuation),
            (r'\)', Punctuation, '#pop'),
        ],
        'predicate': [
            # if predicate is parenthesized, mark paren as punctuation
            (r'(\()([^\n]*)(\))', bygroups(Punctuation, Name.Variable, Punctuation), '#pop'),
            # otherwise color until newline
            (r'[^\n]*', Name.Variable, '#pop'),
        ],
        'instrumentation': [
            (r'=|\.\.', Punctuation),
            (r' +', Whitespace),
            (r'(rows|width|time|loops)', Name.Class),
            (r'\d+\.\d+', Number.Float),
            (r'(\d+)', Number.Integer),
            (r'\)', Punctuation, '#pop'),
        ],
        'conflict': [
            (r'(Resolution: )(\w+)', bygroups(Comment.Preproc, Name.Variable)),
            (r'(Arbiter \w+:)', Comment.Preproc, 'object_name'),
            (r'(Filter: )', Comment.Preproc, 'predicate'),
        ],
        'setting': [
            (r'([a-z_]*?)(\s*)(=)(\s*)(\'.*?\')',
             bygroups(Name.Attribute, Whitespace, Operator, Whitespace, String)),
            (r'\, ', Punctuation),
        ],
        'init_plan': [
            (r'\(', Punctuation),
            (r'returns \$\d+(,\$\d+)?', Name.Variable),
            (r'\)', Punctuation, '#pop'),
        ],
        'sort': [
            (r':|kB', Punctuation),
            (r'(quicksort|top-N|heapsort|Average|Memory|Peak)', Comment.Preproc),
            (r'(external|merge|Disk|sort)', Name.Exception),
            (r'(\d+)', Number.Integer),
            (r' +', Whitespace),
        ],
    }
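
# Example of the kind of plan line this lexer targets (a sketch of typical
# EXPLAIN ANALYZE output; the values are arbitrary):
#
#     Seq Scan on tenk1  (cost=0.00..458.00 rows=10000 width=244)
#                        (actual time=0.128..0.377 rows=10000 loops=1)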


class SqlLexer(RegexLexer):
    """
    Lexer for Structured Query Language. Currently, this lexer does
    not recognize any special syntax except ANSI SQL.
    """

    name = 'SQL'
    aliases = ['sql']
    filenames = ['*.sql']
    mimetypes = ['text/x-sql']
    url = 'https://en.wikipedia.org/wiki/SQL'
    version_added = ''

    flags = re.IGNORECASE
    tokens = {
        'root': [
            (r'\s+', Whitespace),
            (r'--.*\n?', Comment.Single),
            (r'/\*', Comment.Multiline, 'multiline-comments'),
            (words(_sql_builtins.KEYWORDS, suffix=r'\b'), Keyword),
            (words(_sql_builtins.DATATYPES, suffix=r'\b'), Name.Builtin),
            (r'[+*/<>=~!@#%^&|`?-]', Operator),
            (r'[0-9]+', Number.Integer),
            # TODO: Backslash escapes?
            (r"'(''|[^'])*'", String.Single),
            (r'"(""|[^"])*"', String.Symbol),  # not a real string literal in ANSI SQL
            (r'[a-z_][\w$]*', Name),  # allow $s in strings for Oracle
            (r'[;:()\[\],.]', Punctuation)
        ],
        'multiline-comments': [
            (r'/\*', Comment.Multiline, 'multiline-comments'),
            (r'\*/', Comment.Multiline, '#pop'),
            (r'[^/*]+', Comment.Multiline),
            (r'[/*]', Comment.Multiline)
        ]
    }

    def analyse_text(self, text):
        return


class TransactSqlLexer(RegexLexer):
    """
    Transact-SQL (T-SQL) is Microsoft's and Sybase's proprietary extension to
    SQL.

    The list of keywords includes ODBC and keywords reserved for future use.
    """

    name = 'Transact-SQL'
    aliases = ['tsql', 't-sql']
    filenames = ['*.sql']
    mimetypes = ['text/x-tsql']
    url = 'https://www.tsql.info'
    version_added = ''

    flags = re.IGNORECASE

    tokens = {
        'root': [
            (r'\s+', Whitespace),
            (r'--.*[$|\n]?', Comment.Single),
            (r'/\*', Comment.Multiline, 'multiline-comments'),
            (words(_tsql_builtins.OPERATORS), Operator),
            (words(_tsql_builtins.OPERATOR_WORDS, suffix=r'\b'), Operator.Word),
            (words(_tsql_builtins.TYPES, suffix=r'\b'), Name.Class),
            (words(_tsql_builtins.FUNCTIONS, suffix=r'\b'), Name.Function),
            (r'(goto)(\s+)(\w+\b)', bygroups(Keyword, Whitespace, Name.Label)),
            (words(_tsql_builtins.KEYWORDS, suffix=r'\b'), Keyword),
            (r'(\[)([^]]+)(\])', bygroups(Operator, Name, Operator)),
            (r'0x[0-9a-f]+', Number.Hex),
            # Float variant 1, for example: 1., 1.e2, 1.2e3
            (r'[0-9]+\.[0-9]*(e[+-]?[0-9]+)?', Number.Float),
            # Float variant 2, for example: .1, .1e2
            (r'\.[0-9]+(e[+-]?[0-9]+)?', Number.Float),
            # Float variant 3, for example: 123e45
            (r'[0-9]+e[+-]?[0-9]+', Number.Float),
            (r'[0-9]+', Number.Integer),
            (r"'(''|[^'])*'", String.Single),
            (r'"(""|[^"])*"', String.Symbol),
            (r'[;(),.]', Punctuation),
            # Below we use \w even for the first "real" character because
            # tokens starting with a digit have already been recognized
            # as Number above.
            (r'@@\w+', Name.Builtin),
            (r'@\w+', Name.Variable),
            (r'(\w+)(:)', bygroups(Name.Label, Punctuation)),
            (r'#?#?\w+', Name),  # names for temp tables and anything else
            (r'\?', Name.Variable.Magic),  # parameter for prepared statements
        ],
        'multiline-comments': [
            (r'/\*', Comment.Multiline, 'multiline-comments'),
            (r'\*/', Comment.Multiline, '#pop'),
            (r'[^/*]+', Comment.Multiline),
            (r'[/*]', Comment.Multiline)
        ]
    }

    def analyse_text(text):
        rating = 0
        if tsql_declare_re.search(text):
            # Found T-SQL variable declaration.
            rating = 1.0
        else:
            name_between_backtick_count = len(
                name_between_backtick_re.findall(text))
            name_between_bracket_count = len(
                name_between_bracket_re.findall(text))
            # We need to check that at least one name uses backticks or
            # brackets: otherwise both counts are 0 and the comparison
            # 0 >= 2 * 0 would always hold.
            dialect_name_count = name_between_backtick_count + name_between_bracket_count
            if dialect_name_count >= 1 and \
               name_between_bracket_count >= 2 * name_between_backtick_count:
                # Found at least twice as many [name] as `name`.
                rating += 0.5
            elif name_between_bracket_count > name_between_backtick_count:
                rating += 0.2
            elif name_between_bracket_count > 0:
                rating += 0.1
        if tsql_variable_re.search(text) is not None:
            rating += 0.1
        if tsql_go_re.search(text) is not None:
            rating += 0.1
        return rating
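
# A worked example of the heuristic above (illustrative snippets): the text
# "DECLARE @name VARCHAR(10)" matches tsql_declare_re, so analyse_text
# returns 1.0 immediately; "SELECT [col] FROM [tbl] WHERE @x > 0 GO" has two
# [name] tokens, an @variable and a GO, scoring 0.5 + 0.1 + 0.1 = 0.7.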


class MySqlLexer(RegexLexer):
    """The Oracle MySQL lexer.

    This lexer does not attempt to maintain strict compatibility with
    MariaDB syntax or keywords. Although MySQL and MariaDB's common code
    history suggests there may be significant overlap between the two,
    compatibility is not a target for this lexer.
    """

    name = 'MySQL'
    aliases = ['mysql']
    mimetypes = ['text/x-mysql']
    url = 'https://www.mysql.com'
    version_added = ''

    flags = re.IGNORECASE
    tokens = {
        'root': [
            (r'\s+', Whitespace),

            # Comments
            (r'(?:#|--\s+).*', Comment.Single),
            (r'/\*\+', Comment.Special, 'optimizer-hints'),
            (r'/\*', Comment.Multiline, 'multiline-comment'),

            # Hexadecimal literals
            (r"x'([0-9a-f]{2})+'", Number.Hex),  # MySQL requires paired hex characters in this form.
            (r'0x[0-9a-f]+', Number.Hex),

            # Binary literals
            (r"b'[01]+'", Number.Bin),
            (r'0b[01]+', Number.Bin),

            # Numeric literals
            (r'[0-9]+\.[0-9]*(e[+-]?[0-9]+)?', Number.Float),  # Mandatory integer, optional fraction and exponent
            (r'[0-9]*\.[0-9]+(e[+-]?[0-9]+)?', Number.Float),  # Mandatory fraction, optional integer and exponent
            (r'[0-9]+e[+-]?[0-9]+', Number.Float),  # Exponents with integer significands are still floats
            (r'[0-9]+(?=[^0-9a-z$_\u0080-\uffff])', Number.Integer),  # Integers that are not in a schema object name

            # Date literals
            (r"\{\s*d\s*(?P<quote>['\"])\s*\d{2}(\d{2})?.?\d{2}.?\d{2}\s*(?P=quote)\s*\}",
             Literal.Date),

            # Time literals
            (r"\{\s*t\s*(?P<quote>['\"])\s*(?:\d+\s+)?\d{1,2}.?\d{1,2}.?\d{1,2}(\.\d*)?\s*(?P=quote)\s*\}",
             Literal.Date),

            # Timestamp literals
            (
                r"\{\s*ts\s*(?P<quote>['\"])\s*"
                r"\d{2}(?:\d{2})?.?\d{2}.?\d{2}"  # Date part
                r"\s+"  # Whitespace between date and time
                r"\d{1,2}.?\d{1,2}.?\d{1,2}(\.\d*)?"  # Time part
                r"\s*(?P=quote)\s*\}",
                Literal.Date
            ),

            # String literals
            (r"'", String.Single, 'single-quoted-string'),
            (r'"', String.Double, 'double-quoted-string'),

            # Variables
            (r'@@(?:global\.|persist\.|persist_only\.|session\.)?[a-z_]+', Name.Variable),
            (r'@[a-z0-9_$.]+', Name.Variable),
            (r"@'", Name.Variable, 'single-quoted-variable'),
            (r'@"', Name.Variable, 'double-quoted-variable'),
            (r"@`", Name.Variable, 'backtick-quoted-variable'),
            (r'\?', Name.Variable),  # For demonstrating prepared statements

            # Operators
            (r'[!%&*+/:<=>^|~-]+', Operator),

            # Exceptions; these words tokenize differently in different contexts.
            (r'\b(set)(?!\s*\()', Keyword),
            (r'\b(character)(\s+)(set)\b', bygroups(Keyword, Whitespace, Keyword)),
            # In all other known cases, "SET" is tokenized by MYSQL_DATATYPES.

            (words(_mysql_builtins.MYSQL_CONSTANTS, prefix=r'\b', suffix=r'\b'),
             Name.Constant),
            (words(_mysql_builtins.MYSQL_DATATYPES, prefix=r'\b', suffix=r'\b'),
             Keyword.Type),
            (words(_mysql_builtins.MYSQL_KEYWORDS, prefix=r'\b', suffix=r'\b'),
             Keyword),
            (words(_mysql_builtins.MYSQL_FUNCTIONS, prefix=r'\b', suffix=r'\b(\s*)(\()'),
             bygroups(Name.Function, Whitespace, Punctuation)),

            # Schema object names
            #
            # Note: Although the first regex supports unquoted all-numeric
            # identifiers, this will not be a problem in practice because
            # numeric literals have already been handled above.
            #
            ('[0-9a-z$_\u0080-\uffff]+', Name),
            (r'`', Name.Quoted, 'schema-object-name'),

            # Punctuation
            (r'[(),.;]', Punctuation),
        ],

        # Multiline comment substates
        # ---------------------------

        'optimizer-hints': [
            (r'[^*a-z]+', Comment.Special),
            (r'\*/', Comment.Special, '#pop'),
            (words(_mysql_builtins.MYSQL_OPTIMIZER_HINTS, suffix=r'\b'),
             Comment.Preproc),
            ('[a-z]+', Comment.Special),
            (r'\*', Comment.Special),
        ],

        'multiline-comment': [
            (r'[^*]+', Comment.Multiline),
            (r'\*/', Comment.Multiline, '#pop'),
            (r'\*', Comment.Multiline),
        ],

        # String substates
        # ----------------

        'single-quoted-string': [
            (r"[^'\\]+", String.Single),
            (r"''", String.Escape),
            (r"""\\[0'"bnrtZ\\%_]""", String.Escape),
            (r"'", String.Single, '#pop'),
        ],

        'double-quoted-string': [
            (r'[^"\\]+', String.Double),
            (r'""', String.Escape),
            (r"""\\[0'"bnrtZ\\%_]""", String.Escape),
            (r'"', String.Double, '#pop'),
        ],

        # Variable substates
        # ------------------

        'single-quoted-variable': [
            (r"[^']+", Name.Variable),
            (r"''", Name.Variable),
            (r"'", Name.Variable, '#pop'),
        ],

        'double-quoted-variable': [
            (r'[^"]+', Name.Variable),
            (r'""', Name.Variable),
            (r'"', Name.Variable, '#pop'),
        ],

        'backtick-quoted-variable': [
            (r'[^`]+', Name.Variable),
            (r'``', Name.Variable),
            (r'`', Name.Variable, '#pop'),
        ],

        # Schema object name substates
        # ----------------------------
        #
        # "Name.Quoted" and "Name.Quoted.Escape" are non-standard tokens;
        # formatters will style them as "Name" by default, but can add
        # additional styles based on the token name. This gives users
        # flexibility to add custom styles as desired.
        #
        'schema-object-name': [
            (r'[^`]+', Name.Quoted),
            (r'``', Name.Quoted.Escape),
            (r'`', Name.Quoted, '#pop'),
        ],
    }

    def analyse_text(text):
        rating = 0
        name_between_backtick_count = len(
            name_between_backtick_re.findall(text))
        name_between_bracket_count = len(
            name_between_bracket_re.findall(text))
        # Same logic as above in the TSQL analysis
        dialect_name_count = name_between_backtick_count + name_between_bracket_count
        if dialect_name_count >= 1 and \
           name_between_backtick_count >= 2 * name_between_bracket_count:
            # Found at least twice as many `name` as [name].
            rating += 0.5
        elif name_between_backtick_count > name_between_bracket_count:
            rating += 0.2
        elif name_between_backtick_count > 0:
            rating += 0.1
        return rating


class GoogleSqlLexer(RegexLexer):
    """
    GoogleSQL is Google's standard SQL dialect, formerly known as ZetaSQL.

    The list of keywords includes reserved words for future use.
    """

    name = 'GoogleSQL'
    aliases = ['googlesql', 'zetasql']
    filenames = ['*.googlesql', '*.googlesql.sql']
    mimetypes = ['text/x-google-sql', 'text/x-google-sql-aux']
    url = 'https://cloud.google.com/bigquery/googlesql'
    version_added = '2.19'

    flags = re.IGNORECASE
    tokens = {
        'root': [
            (r'\s+', Whitespace),

            # Comments
            (r'(?:#|--\s+).*', Comment.Single),
            (r'/\*', Comment.Multiline, 'multiline-comment'),

            # Hexadecimal literals
            (r"x'([0-9a-f]{2})+'", Number.Hex),
            (r'0x[0-9a-f]+', Number.Hex),

            # Binary literals
            (r"b'[01]+'", Number.Bin),
            (r'0b[01]+', Number.Bin),

            # Numeric literals
            (r'[0-9]+\.[0-9]*(e[+-]?[0-9]+)?', Number.Float),  # Mandatory integer, optional fraction and exponent
            (r'[0-9]*\.[0-9]+(e[+-]?[0-9]+)?', Number.Float),  # Mandatory fraction, optional integer and exponent
            (r'[0-9]+e[+-]?[0-9]+', Number.Float),  # Exponents with integer significands are still floats
            (r'[0-9]+(?=[^0-9a-z$_\u0080-\uffff])', Number.Integer),  # Integers that are not in a schema object name

            # Date literals
            (r"\{\s*d\s*(?P<quote>['\"])\s*\d{2}(\d{2})?.?\d{2}.?\d{2}\s*(?P=quote)\s*\}",
             Literal.Date),

            # Time literals
            (r"\{\s*t\s*(?P<quote>['\"])\s*(?:\d+\s+)?\d{1,2}.?\d{1,2}.?\d{1,2}(\.\d*)?\s*(?P=quote)\s*\}",
             Literal.Date),

            # Timestamp literals
            (
                r"\{\s*ts\s*(?P<quote>['\"])\s*"
                r"\d{2}(?:\d{2})?.?\d{2}.?\d{2}"  # Date part
                r"\s+"  # Whitespace between date and time
                r"\d{1,2}.?\d{1,2}.?\d{1,2}(\.\d*)?"  # Time part
                r"\s*(?P=quote)\s*\}",
                Literal.Date
            ),

            # String literals
            (r"'", String.Single, 'single-quoted-string'),
            (r'"', String.Double, 'double-quoted-string'),

            # Variables
            (r'@@(?:global\.|persist\.|persist_only\.|session\.)?[a-z_]+', Name.Variable),
            (r'@[a-z0-9_$.]+', Name.Variable),
            (r"@'", Name.Variable, 'single-quoted-variable'),
            (r'@"', Name.Variable, 'double-quoted-variable'),
            (r"@`", Name.Variable, 'backtick-quoted-variable'),
            (r'\?', Name.Variable),  # For demonstrating prepared statements

            # Exceptions; these words tokenize differently in different contexts.
            (r'\b(set)(?!\s*\()', Keyword),
            (r'\b(character)(\s+)(set)\b', bygroups(Keyword, Whitespace, Keyword)),

            # Constants, types, keywords, functions, operators
            (words(_googlesql_builtins.constants, prefix=r'\b', suffix=r'\b'), Name.Constant),
            (words(_googlesql_builtins.types, prefix=r'\b', suffix=r'\b'), Keyword.Type),
            (words(_googlesql_builtins.keywords, prefix=r'\b', suffix=r'\b'), Keyword),
            (words(_googlesql_builtins.functionnames, prefix=r'\b', suffix=r'\b(\s*)(\()'),
             bygroups(Name.Function, Whitespace, Punctuation)),
            (words(_googlesql_builtins.operators, prefix=r'\b', suffix=r'\b'), Operator),

            # Schema object names
            #
            # Note: Although the first regex supports unquoted all-numeric
            # identifiers, this will not be a problem in practice because
            # numeric literals have already been handled above.
            #
            ('[0-9a-z$_\u0080-\uffff]+', Name),
            (r'`', Name.Quoted, 'schema-object-name'),

            # Punctuation
            (r'[(),.;]', Punctuation),
        ],

        # Multiline comment substates
        # ---------------------------

        'multiline-comment': [
            (r'[^*]+', Comment.Multiline),
            (r'\*/', Comment.Multiline, '#pop'),
            (r'\*', Comment.Multiline),
        ],

        # String substates
        # ----------------

        'single-quoted-string': [
            (r"[^'\\]+", String.Single),
            (r"''", String.Escape),
            (r"""\\[0'"bnrtZ\\%_]""", String.Escape),
            (r"'", String.Single, '#pop'),
        ],

        'double-quoted-string': [
            (r'[^"\\]+', String.Double),
            (r'""', String.Escape),
            (r"""\\[0'"bnrtZ\\%_]""", String.Escape),
            (r'"', String.Double, '#pop'),
        ],

        # Variable substates
        # ------------------

        'single-quoted-variable': [
            (r"[^']+", Name.Variable),
            (r"''", Name.Variable),
            (r"'", Name.Variable, '#pop'),
        ],

        'double-quoted-variable': [
            (r'[^"]+', Name.Variable),
            (r'""', Name.Variable),
            (r'"', Name.Variable, '#pop'),
        ],

        'backtick-quoted-variable': [
            (r'[^`]+', Name.Variable),
            (r'``', Name.Variable),
            (r'`', Name.Variable, '#pop'),
        ],

        # Schema object name substates
        # ----------------------------
        #
        # "Name.Quoted" and "Name.Quoted.Escape" are non-standard tokens;
        # formatters will style them as "Name" by default, but can add
        # additional styles based on the token name. This gives users
        # flexibility to add custom styles as desired.
        #
        'schema-object-name': [
            (r'[^`]+', Name.Quoted),
            (r'``', Name.Quoted.Escape),
            (r'`', Name.Quoted, '#pop'),
        ],
    }

    def analyse_text(text):
        tokens = collections.Counter(text.split())
        return 0.001 * sum(count for t, count in tokens.items()
                           if t in googlesql_identifiers)


class SqliteConsoleLexer(Lexer):
    """
    Lexer for example sessions using sqlite3.
    """

    name = 'sqlite3con'
    aliases = ['sqlite3']
    filenames = ['*.sqlite3-console']
    mimetypes = ['text/x-sqlite3-console']
    url = 'https://www.sqlite.org'
    version_added = '0.11'
    _example = "sqlite3/sqlite3.sqlite3-console"

    def get_tokens_unprocessed(self, data):
        sql = SqlLexer(**self.options)

        curcode = ''
        insertions = []
        for match in line_re.finditer(data):
            line = match.group()
            prompt_match = sqlite_prompt_re.match(line)
            if prompt_match is not None:
                insertions.append((len(curcode),
                                   [(0, Generic.Prompt, line[:7])]))
                insertions.append((len(curcode),
                                   [(7, Whitespace, ' ')]))
                curcode += line[8:]
            else:
                if curcode:
                    yield from do_insertions(insertions,
                                             sql.get_tokens_unprocessed(curcode))
                    curcode = ''
                    insertions = []
                if line.startswith('SQL error: '):
                    yield (match.start(), Generic.Traceback, line)
                else:
                    yield (match.start(), Generic.Output, line)
        if curcode:
            yield from do_insertions(insertions,
                                     sql.get_tokens_unprocessed(curcode))


class RqlLexer(RegexLexer):
    """
    Lexer for Relation Query Language.
    """

    name = 'RQL'
    url = 'http://www.logilab.org/project/rql'
    aliases = ['rql']
    filenames = ['*.rql']
    mimetypes = ['text/x-rql']
    version_added = '2.0'

    flags = re.IGNORECASE
    tokens = {
        'root': [
            (r'\s+', Whitespace),
            (r'(DELETE|SET|INSERT|UNION|DISTINCT|WITH|WHERE|BEING|OR'
             r'|AND|NOT|GROUPBY|HAVING|ORDERBY|ASC|DESC|LIMIT|OFFSET'
             r'|TODAY|NOW|TRUE|FALSE|NULL|EXISTS)\b', Keyword),
            (r'[+*/<>=%-]', Operator),
            (r'(Any|is|instance_of|CWEType|CWRelation)\b', Name.Builtin),
            (r'[0-9]+', Number.Integer),
            (r'[A-Z_]\w*\??', Name),
            (r"'(''|[^'])*'", String.Single),
            (r'"(""|[^"])*"', String.Single),
            (r'[;:()\[\],.]', Punctuation)
        ],
    }