"""
ldap.schema.tokenizer - Low-level parsing functions for schema element strings

See https://www.python-ldap.org/ for details.
"""

import re

# Bound ``findall`` of the tokenizer pattern.  Called with a schema element
# string it returns one 5-tuple per token:
#   (opening paren, closing paren, unquoted, quoted, residue)
# Exactly one item of each tuple is non-empty; whitespace between tokens is
# skipped because no alternative matches it.
TOKENS_FINDALL = re.compile(
    r"(\()"           # opening parenthesis
    r"|"              # or
    r"(\))"           # closing parenthesis
    r"|"              # or
    r"([^'$()\s]+)"   # string of length >= 1 without '$() or whitespace
    r"|"              # or
    r"('(?:[^'\\]|\\.)*'(?!\w))"
                      # any string or empty string surrounded by unescaped
                      # single quotes except if right quote is succeeded by
                      # alphanumeric char
    r"|"              # or
    r"([^\s]+?)",     # residue, all non-whitespace strings
).findall

# Matches a backslash followed by any single character; substituting r"\1"
# drops the backslash and keeps the escaped character.
UNESCAPE_PATTERN = re.compile(r"\\(.)")

def split_tokens(s):
    """
    Returns list of syntax elements with quotes and spaces stripped.

    Parentheses are kept as separate "(" and ")" items.  Quoted strings are
    returned without their surrounding single quotes and with backslash
    escapes removed.  "$" separators are checked but not added to the list.

    Raises ValueError if "$" occurs while no parenthesis is open, if any
    other unexpected residue is found, or if the numbers of opening and
    closing parentheses differ.
    """
    parts = []
    parens = 0
    for opar, cpar, unquoted, quoted, residue in TOKENS_FINDALL(s):
        if unquoted:
            parts.append(unquoted)
        elif quoted:
            # Strip the enclosing quotes, then drop escaping backslashes
            parts.append(UNESCAPE_PATTERN.sub(r'\1', quoted[1:-1]))
        elif opar:
            parens += 1
            parts.append(opar)
        elif cpar:
            parens -= 1
            parts.append(cpar)
        elif residue == '$':
            # '$' is only a value separator inside a parenthesized list
            if not parens:
                raise ValueError("'$' outside parenthesis in %r" % (s))
        else:
            raise ValueError(residue, s)
    if parens:
        raise ValueError("Unbalanced parenthesis in %r" % (s))
    return parts

def extract_tokens(l,known_tokens):
    """
    Returns dictionary of known tokens with all values

    l
        List of string tokens; the first item must be "(" and the last
        item must be ")".
    known_tokens
        Mapping of the token names to recognize to their default values.
        It is copied, not modified.

    Each known token found in l is mapped to a tuple: the empty tuple for a
    token without value, a 1-tuple for a single value, or a tuple of all
    items (without "$") of a parenthesized list.  Known tokens not found in
    l keep their default value.  Items which are not known tokens are
    skipped.
    """
    assert l[0].strip()=="(" and l[-1].strip()==")",ValueError(l)
    # Start from the defaults so membership in result means "known token"
    result = dict(known_tokens)
    i = 0
    l_len = len(l)
    while i < l_len:
        if l[i] not in result:
            i += 1  # Consume unrecognized item
            continue
        token = l[i]
        i += 1  # Consume token
        if i >= l_len:
            # Token was the very last item: keep its default value
            break
        if l[i] in result or l[i] == ")":
            # non-valued: followed by the next known token or by the
            # closing parenthesis, neither of which is a value
            result[token] = ()
        elif l[i] == "(":
            # multi-valued
            i += 1  # Consume left parentheses
            start = i
            while i < l_len and l[i] != ")":
                i += 1
            result[token] = tuple(v for v in l[start:i] if v != '$')
            i += 1  # Consume right parentheses
        else:
            # single-valued
            result[token] = (l[i],)
            i += 1  # Consume single value
    return result