// Mirror of https://github.com/JKorf/CryptoExchange.Net
// (synced 2025-11-03 20:07:33 +00:00).
// Listing metadata from the mirror page: 355 lines, 12 KiB, JavaScript.

import * as regex from './regex';
import { inherit } from './utils';

// keywords that should have no default relevance value
// (each word is looked up by `commonKeyword`; unscored keywords found here
// get a relevance of 0 in `scoreForKeyword`)
const COMMON_KEYWORDS = 'of and for in not or if then'.split(' ');

// compilation

/**
 * Compiles a language definition in place.
 *
 * Walks the mode tree starting at `language` and, for every mode, fills in
 * the derived properties the visible code assigns: `compiled`,
 * `__beforeBegin`, `keywords` (compiled form), `keywordPatternRe`,
 * `beginRe`, `endRe`, `terminator_end`, `illegalRe`, `relevance`,
 * `contains` (with variants expanded) and `matcher`.
 *
 * @param {object} language - the language definition (the root mode). It is
 *   mutated; nothing is returned.
 * @throws {Error} if the top-level `contains` includes `'self'`.
 * @throws {Error} if a mode specifies both `lexemes` and `keywords.$pattern`.
 */
export function compileLanguage(language) {
  /**
   * Builds a RegExp from a string or RegExp, always multi-line, adding the
   * `i` flag when the language is case insensitive and `g` when requested.
   *
   * @param {RegExp | string} value
   * @param {boolean} [global]
   * @returns {RegExp}
   */
  function langRe(value, global) {
    return new RegExp(
      regex.source(value),
      'm' + (language.case_insensitive ? 'i' : '') + (global ? 'g' : '')
    );
  }

  /**
    Stores multiple regular expressions and allows you to quickly search for
    them all in a string simultaneously - returning the first match.  It does
    this by creating a huge (a|b|c) regex - each individual item wrapped with ()
    and joined by `|` - using match groups to track position.  When a match is
    found checking which position in the array has content allows us to figure
    out which of the original regexes / match groups triggered the match.

    The match object itself (the result of `Regex.exec`) is returned but also
    enhanced by merging in any meta-data that was registered with the regex.
    This is how we keep track of which mode matched, and what type of rule
    (`illegal`, `begin`, end, etc).
  */
  class MultiRegex {
    constructor() {
      // capture-group index of each rule's outer group -> that rule's meta-data
      this.matchIndexes = {};
      // [meta-data, regex] pairs in the order they were added
      this.regexes = [];
      // capture-group index the next added rule will occupy
      this.matchAt = 1;
      // ordinal of the next added rule within THIS matcher
      this.position = 0;
    }

    addRule(re, opts) {
      // Store a per-matcher copy instead of writing `position` onto `opts`.
      // The same `opts` object is handed to every MultiRegex that
      // ResumableMultiRegex builds, so mutating it let a matcher created
      // later overwrite the position reported by one created earlier.
      const data = Object.assign({}, opts, { position: this.position++ });
      this.matchIndexes[this.matchAt] = data;
      this.regexes.push([data, re]);
      // skip over this rule's own capture groups plus its wrapping group
      this.matchAt += regex.countMatchGroups(re) + 1;
    }

    compile() {
      if (this.regexes.length === 0) {
        // avoids the need to check length every time exec is called
        this.exec = () => null;
      }
      const terminators = this.regexes.map(el => el[1]);
      this.matcherRe = langRe(regex.join(terminators, '|'), true);
      this.lastIndex = 0;
    }

    exec(s) {
      this.matcherRe.lastIndex = this.lastIndex;
      const match = this.matcherRe.exec(s);
      if (!match) { return null; }

      // the first defined capture group (past the full match at index 0)
      // identifies which rule matched
      // eslint-disable-next-line no-undefined
      const groupIndex = match.findIndex((el, idx) => idx > 0 && el !== undefined);
      const matchData = this.matchIndexes[groupIndex];
      // trim off any earlier non-relevant match groups (ie, the other regex
      // match groups that make up the multi-matcher)
      match.splice(0, groupIndex);

      return Object.assign(match, matchData);
    }
  }

  /*
    Created to solve the key deficiency with MultiRegex - there is no way to
    test for multiple matches at a single location.  Why would we need to do
    that?  In the future a more dynamic engine will allow certain matches to be
    ignored.  An example: if we matched say the 3rd regex in a large group but
    decided to ignore it - we'd need to start testing again at the 4th
    regex... but MultiRegex itself gives us no real way to do that.

    So this class creates MultiRegexes on the fly for whatever search
    position they are needed.

    NOTE: These additional MultiRegex objects are created dynamically.  For most
    grammars most of the time we will never actually need anything more than the
    first MultiRegex - so this shouldn't have too much overhead.

    Say this is our search group, and we match regex3, but wish to ignore it.

      regex1 | regex2 | regex3 | regex4 | regex5    ' ie, startAt = 0

    What we need is a new MultiRegex that only includes the remaining
    possibilities:

      regex4 | regex5                               ' ie, startAt = 3

    This class wraps all that complexity up in a simple API... `startAt` decides
    where in the array of expressions to start doing the matching. It
    auto-increments, so if a match is found at position 2, then startAt will be
    set to 3.  If the end is reached startAt will return to 0.

    MOST of the time the parser will be setting startAt manually to 0.
  */
  class ResumableMultiRegex {
    constructor() {
      // [regex, meta-data] pairs for every rule, in priority order
      this.rules = [];
      // lazily built MultiRegex per starting rule index
      this.multiRegexes = [];
      // number of rules whose type is "begin"
      this.count = 0;

      this.lastIndex = 0;
      // index of the first rule the next exec() will consider
      this.regexIndex = 0;
    }

    getMatcher(index) {
      if (this.multiRegexes[index]) return this.multiRegexes[index];

      const matcher = new MultiRegex();
      this.rules.slice(index).forEach(([re, opts]) => matcher.addRule(re, opts));
      matcher.compile();
      this.multiRegexes[index] = matcher;
      return matcher;
    }

    considerAll() {
      this.regexIndex = 0;
    }

    addRule(re, opts) {
      this.rules.push([re, opts]);
      if (opts.type === "begin") this.count++;
    }

    exec(s) {
      const m = this.getMatcher(this.regexIndex);
      m.lastIndex = this.lastIndex;
      const result = m.exec(s);
      if (result) {
        // `result.position` is relative to the matcher that produced it, so
        // adding it to the matcher's start index yields the absolute rule
        // index; +1 resumes after the rule that just matched
        this.regexIndex += result.position + 1;
        if (this.regexIndex === this.count) { // wrap-around
          this.regexIndex = 0;
        }
      }

      return result;
    }
  }

  /**
   * Builds the resumable matcher for a mode: one "begin" rule per contained
   * mode, then the mode's `terminator_end` ("end") and `illegal` ("illegal")
   * rules when present.
   */
  function buildModeRegex(mode) {
    const mm = new ResumableMultiRegex();

    mode.contains.forEach(term => mm.addRule(term.begin, { rule: term, type: "begin" }));

    if (mode.terminator_end) {
      mm.addRule(mode.terminator_end, { type: "end" });
    }
    if (mode.illegal) {
      mm.addRule(mode.illegal, { type: "illegal" });
    }

    return mm;
  }

  // TODO: We need negative look-behind support to do this properly
  /**
   * `__beforeBegin` callback used for `beginKeywords` modes: asks the parser
   * to ignore the match when the character immediately before or after the
   * matched text is a dot.
   */
  function skipIfhasPrecedingOrTrailingDot(match, resp) {
    const before = match.input[match.index - 1];
    const after = match.input[match.index + match[0].length];
    if (before === "." || after === ".") {
      resp.ignoreMatch();
    }
  }

  /** skip vs abort vs ignore
   *
   * @skip   - The mode is still entered and exited normally (and contains rules apply),
   *           but all content is held and added to the parent buffer rather than being
   *           output when the mode ends.  Mostly used with `sublanguage` to build up
   *           a single large buffer that can be parsed by sublanguage.
   *
   *             - The mode begins and ends normally.
   *             - Content matched is added to the parent mode buffer.
   *             - The parser cursor is moved forward normally.
   *
   * @abort  - A hack placeholder until we have ignore.  Aborts the mode (as if it
   *           never matched) but DOES NOT continue to match subsequent `contains`
   *           modes.  Abort is bad/suboptimal because it can result in modes
   *           farther down not getting applied because an earlier rule eats the
   *           content but then aborts.
   *
   *             - The mode does not begin.
   *             - Content matched by `begin` is added to the mode buffer.
   *             - The parser cursor is moved forward accordingly.
   *
   * @ignore - Ignores the mode (as if it never matched) and continues to match any
   *           subsequent `contains` modes.  Ignore isn't technically possible with
   *           the current parser implementation.
   *
   *             - The mode does not begin.
   *             - Content matched by `begin` is ignored.
   *             - The parser cursor is not moved forward.
   */

  /**
   * Compiles a single mode (and, recursively, its `contains` and `starts`
   * modes) in place.  A mode is only compiled once; the `compiled` flag
   * guards against re-entry (eg, through `self` references).
   *
   * @param {object} mode - the mode to compile (mutated)
   * @param {object} [parent] - the enclosing mode; omitted for the language root
   */
  function compileMode(mode, parent) {
    if (mode.compiled) return;
    mode.compiled = true;

    // __beforeBegin is considered private API, internal use only
    mode.__beforeBegin = null;

    mode.keywords = mode.keywords || mode.beginKeywords;

    let kw_pattern = null;
    if (typeof mode.keywords === "object") {
      // `$pattern` is configuration, not a keyword class, so pull it out
      // before the keywords are compiled
      kw_pattern = mode.keywords.$pattern;
      delete mode.keywords.$pattern;
    }

    if (mode.keywords) {
      mode.keywords = compileKeywords(mode.keywords, language.case_insensitive);
    }

    // both are not allowed
    if (mode.lexemes && kw_pattern) {
      throw new Error("ERR: Prefer `keywords.$pattern` to `mode.lexemes`, BOTH are not allowed. (see mode reference) ");
    }

    // `mode.lexemes` was the old standard before we added and now recommend
    // using `keywords.$pattern` to pass the keyword pattern
    mode.keywordPatternRe = langRe(mode.lexemes || kw_pattern || /\w+/, true);

    if (parent) {
      if (mode.beginKeywords) {
        // for languages with keywords that include non-word characters checking for
        // a word boundary is not sufficient, so instead we check for a word boundary
        // or whitespace - this does no harm in any case since our keyword engine
        // doesn't allow spaces in keywords anyways and we still check for the boundary
        // first
        mode.begin = '\\b(' + mode.beginKeywords.split(' ').join('|') + ')(?=\\b|\\s)';
        mode.__beforeBegin = skipIfhasPrecedingOrTrailingDot;
      }
      if (!mode.begin) {
        mode.begin = /\B|\b/;
      }
      mode.beginRe = langRe(mode.begin);
      if (mode.endSameAsBegin) {
        mode.end = mode.begin;
      }
      if (!mode.end && !mode.endsWithParent) {
        mode.end = /\B|\b/;
      }
      if (mode.end) {
        mode.endRe = langRe(mode.end);
      }
      mode.terminator_end = regex.source(mode.end) || '';
      if (mode.endsWithParent && parent.terminator_end) {
        // the parent's terminator also ends this mode
        mode.terminator_end += (mode.end ? '|' : '') + parent.terminator_end;
      }
    }
    if (mode.illegal) {
      mode.illegalRe = langRe(mode.illegal);
    }
    if (mode.relevance == null) {
      mode.relevance = 1;
    }
    if (!mode.contains) {
      mode.contains = [];
    }
    // expand variants / clone parent-dependent modes, flattening the result
    mode.contains = [].concat(...mode.contains.map(function(c) {
      return expand_or_clone_mode(c === 'self' ? mode : c);
    }));
    mode.contains.forEach(function(c) { compileMode(c, mode); });

    if (mode.starts) {
      compileMode(mode.starts, parent);
    }

    mode.matcher = buildModeRegex(mode);
  }

  // self is not valid at the top-level
  if (language.contains && language.contains.includes('self')) {
    throw new Error("ERR: contains `self` is not supported at the top-level of a language.  See documentation.");
  }
  compileMode(language);
}

/**
 * Reports whether a mode depends on its parent: either the mode itself sets
 * `endsWithParent`, or a mode reachable through its `starts` chain does.
 *
 * @param {object | null | undefined} mode
 * @returns {*} `false` for a missing mode or when nothing in the chain sets
 *   `endsWithParent`; otherwise the first truthy `endsWithParent` value found.
 */
function dependencyOnParent(mode) {
  if (!mode) return false;

  return mode.endsWithParent || dependencyOnParent(mode.starts);
}

/**
 * Prepares a contained mode for compilation under a particular parent.
 *
 * @param {object} mode
 * @returns {object | object[]} the list of variant modes (when the mode has
 *   `variants`), a fresh copy of the mode (when it depends on its parent or
 *   is frozen), or the mode itself.
 */
function expand_or_clone_mode(mode) {
  // build the variant modes once and cache them on the mode
  if (mode.variants && !mode.cached_variants) {
    mode.cached_variants = mode.variants.map(function(variant) {
      return inherit(mode, { variants: null }, variant);
    });
  }

  // EXPAND
  // if we have variants then essentially "replace" the mode with the variants
  // this happens in compileMode, where this function is called from
  if (mode.cached_variants) {
    return mode.cached_variants;
  }

  // CLONE
  // if we have dependencies on parents then we need a unique
  // instance of ourselves, so we can be reused with many
  // different parents without issue
  if (dependencyOnParent(mode)) {
    return inherit(mode, { starts: mode.starts ? inherit(mode.starts) : null });
  }

  // a frozen mode cannot receive the properties compilation assigns,
  // so hand back a copy instead
  if (Object.isFrozen(mode)) {
    return inherit(mode);
  }

  // no special dependency issues, just return ourselves
  return mode;
}

// keywords

/**
 * Turns a mode's raw keyword definition into a lookup table.
 *
 * @param {string | Object<string, string>} rawKeywords - either a single
 *   space-separated keyword string (class name `keyword`), or an object
 *   mapping class names to space-separated keyword strings.  A keyword may
 *   carry an explicit score as `word|score`.
 * @param {boolean} case_insensitive - when truthy, keywords are lower-cased
 *   before being stored.
 * @returns {Object<string, Array>} map of keyword -> `[className, score]`.
 */
function compileKeywords(rawKeywords, case_insensitive) {
  const compiled_keywords = {};

  if (typeof rawKeywords === 'string') { // string
    splitAndCompile('keyword', rawKeywords);
  } else {
    Object.keys(rawKeywords).forEach(function(className) {
      splitAndCompile(className, rawKeywords[className]);
    });
  }
  return compiled_keywords;

  // ---

  function splitAndCompile(className, str) {
    if (case_insensitive) {
      str = str.toLowerCase();
    }
    str.split(' ').forEach(function(keyword) {
      // An empty list, or doubled/leading/trailing spaces, yields empty
      // tokens; skip them rather than registering a bogus '' keyword.
      if (keyword === '') return;

      const pair = keyword.split('|');
      compiled_keywords[pair[0]] = [className, scoreForKeyword(pair[0], pair[1])];
    });
  }
}

/**
 * Picks the relevance score for a keyword.
 *
 * @param {string} keyword
 * @param {string} [providedScore] - the explicit score from a `word|score`
 *   keyword entry, if any.
 * @returns {number} the provided score converted with `Number`, otherwise 0
 *   for a common keyword and 1 for everything else.
 */
function scoreForKeyword(keyword, providedScore) {
  // manual scores always win over common keywords
  // so you can force a score of 1 if you really insist
  if (providedScore) {
    return Number(providedScore);
  }

  return commonKeyword(keyword) ? 0 : 1;
}

/**
 * Tells whether a word is one of the COMMON_KEYWORDS (compared
 * case-insensitively by lower-casing the word first).
 *
 * @param {string} word
 * @returns {boolean}
 */
function commonKeyword(word) {
  return COMMON_KEYWORDS.includes(word.toLowerCase());
}