mirror of
				https://github.com/JKorf/CryptoExchange.Net
				synced 2025-10-30 18:07:42 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			355 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			JavaScript
		
	
	
	
	
	
			
		
		
	
	
			355 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			JavaScript
		
	
	
	
	
	
| import * as regex from './regex';
 | |
| import { inherit } from './utils';
 | |
| 
 | |
// Keywords that should have no default relevance value: unless a grammar
// supplies an explicit score for them, they are scored 0 instead of 1.
const COMMON_KEYWORDS = 'of and for in not or if then'.split(' ');
 | |
| 
 | |
| // compilation
 | |
| 
 | |
/**
 * Compiles a language definition in place.
 *
 * Walks the mode tree rooted at `language` and, for every mode, turns the
 * `begin` / `end` / `illegal` / keyword settings into RegExp objects
 * (`beginRe`, `endRe`, `illegalRe`, `keywordPatternRe`), compiles the keyword
 * table and attaches a `matcher` (a ResumableMultiRegex) that searches for all
 * of the mode's rules at once.
 *
 * @param {Object} language - the language definition; it is mutated in place
 *   and nothing is returned
 * @throws {Error} if `language.contains` includes `'self'`, or if a mode sets
 *   both `lexemes` and `keywords.$pattern`
 */
export function compileLanguage(language) {

  /**
   * Builds a RegExp from a pattern using the language-wide flags.
   *
   * @param {RegExp|string} value - pattern (its source is taken via `regex.source`)
   * @param {boolean} [global] - when truthy the `g` flag is added
   * @returns {RegExp} always multi-line (`m`); case-insensitive (`i`) when
   *   `language.case_insensitive` is set
   */
  function langRe(value, global) {
    return new RegExp(
      regex.source(value),
      'm' + (language.case_insensitive ? 'i' : '') + (global ? 'g' : '')
    );
  }

  /**
    Stores multiple regular expressions and allows you to quickly search for
    them all in a string simultaneously - returning the first match.  It does
    this by creating a huge (a|b|c) regex - each individual item wrapped with ()
    and joined by `|` - using match groups to track position.  When a match is
    found checking which position in the array has content allows us to figure
    out which of the original regexes / match groups triggered the match.

    The match object itself (the result of `Regex.exec`) is returned but also
    enhanced by merging in any meta-data that was registered with the regex.
    This is how we keep track of which mode matched, and what type of rule
    (`illegal`, `begin`, end, etc).
  */
  class MultiRegex {
    constructor() {
      // match-group number (in the combined regex) -> meta-data of the rule
      this.matchIndexes = {};
      // [opts, re] pairs in registration order
      this.regexes = [];
      // match-group number the next added rule will occupy (group 0 is the
      // whole match, so numbering starts at 1)
      this.matchAt = 1;
      // ordinal of the next added rule within this MultiRegex
      this.position = 0;
    }

    /**
     * Registers a rule.
     *
     * @param {RegExp|string} re - pattern of the rule
     * @param {Object} opts - meta-data merged into the match object when this
     *   rule matches; its `position` property is (over)written here
     */
    addRule(re, opts) {
      opts.position = this.position++;
      this.matchIndexes[this.matchAt] = opts;
      this.regexes.push([opts, re]);
      // skip past this rule's wrapping group plus any groups of its own
      this.matchAt += regex.countMatchGroups(re) + 1;
    }

    /** Builds the combined regex; must be called before `exec`. */
    compile() {
      if (this.regexes.length === 0) {
        // avoids the need to check length every time exec is called
        this.exec = () => null;
      }
      const terminators = this.regexes.map(el => el[1]);
      this.matcherRe = langRe(regex.join(terminators, '|'), true);
      this.lastIndex = 0;
    }

    /**
     * Searches `s` starting at `this.lastIndex`.
     *
     * @param {string} s - text to search
     * @returns {Array|null} the match array, trimmed so that index 0 is the
     *   text matched by the winning rule, with that rule's meta-data merged
     *   in; `null` when nothing matches
     */
    exec(s) {
      this.matcherRe.lastIndex = this.lastIndex;
      const match = this.matcherRe.exec(s);
      if (!match) { return null; }

      // the first defined group (other than group 0) belongs to the rule that
      // matched
      // eslint-disable-next-line no-undefined
      const i = match.findIndex((el, i) => i > 0 && el !== undefined);
      const matchData = this.matchIndexes[i];
      // trim off any earlier non-relevant match groups (ie, the other regex
      // match groups that make up the multi-matcher)
      match.splice(0, i);

      return Object.assign(match, matchData);
    }
  }

  /*
    Created to solve the key deficiency with MultiRegex - there is no way to
    test for multiple matches at a single location.  Why would we need to do
    that?  In the future a more dynamic engine will allow certain matches to be
    ignored.  An example: if we matched say the 3rd regex in a large group but
    decided to ignore it - we'd need to start testing again at the 4th
    regex... but MultiRegex itself gives us no real way to do that.

    So this class creates MultiRegexes on the fly for whatever search
    position they are needed.

    NOTE: These additional MultiRegex objects are created dynamically.  For most
    grammars most of the time we will never actually need anything more than the
    first MultiRegex - so this shouldn't have too much overhead.

    Say this is our search group, and we match regex3, but wish to ignore it.

      regex1 | regex2 | regex3 | regex4 | regex5    ' ie, regexIndex = 0

    What we need is a new MultiRegex that only includes the remaining
    possibilities:

      regex4 | regex5                               ' ie, regexIndex = 3

    This class wraps all that complexity up in a simple API... `regexIndex`
    decides where in the array of expressions to start doing the matching. It
    auto-increments, so if a match is found at position 2, then regexIndex will
    be set to 3.  When regexIndex becomes equal to the number of `begin` rules
    it returns to 0.

    MOST of the time the parser will be resetting regexIndex to 0 (see
    `considerAll`).
  */
  class ResumableMultiRegex {
    constructor() {
      // [re, opts] pairs in registration order
      this.rules = [];
      // cache of MultiRegex instances, keyed by the rule index they start at
      this.multiRegexes = [];
      // number of rules registered with type "begin"
      this.count = 0;

      // offset in the searched string at which the next `exec` starts
      this.lastIndex = 0;
      // index of the first rule considered by the next `exec`
      this.regexIndex = 0;
    }

    /**
     * Returns the (lazily built and cached) MultiRegex covering
     * `rules[index..]`.
     *
     * @param {number} index - index of the first rule to include
     * @returns {MultiRegex}
     */
    getMatcher(index) {
      if (this.multiRegexes[index]) return this.multiRegexes[index];

      const matcher = new MultiRegex();
      this.rules.slice(index).forEach(([re, opts]) => matcher.addRule(re, opts));
      matcher.compile();
      this.multiRegexes[index] = matcher;
      return matcher;
    }

    /** Makes the next `exec` consider every rule again. */
    considerAll() {
      this.regexIndex = 0;
    }

    /**
     * Registers a rule.
     *
     * @param {RegExp|string} re - pattern of the rule
     * @param {Object} opts - meta-data for the rule; `opts.type` is one of
     *   "begin", "end" or "illegal" for the rules added by `buildModeRegex`
     */
    addRule(re, opts) {
      this.rules.push([re, opts]);
      if (opts.type === "begin") this.count++;
    }

    /**
     * Searches `s` from `this.lastIndex`, considering only the rules from
     * `this.regexIndex` onwards, then advances `regexIndex` past the rule
     * that matched.
     *
     * @param {string} s - text to search
     * @returns {Array|null} see `MultiRegex#exec`
     */
    exec(s) {
      const m = this.getMatcher(this.regexIndex);
      m.lastIndex = this.lastIndex;
      const result = m.exec(s);
      if (result) {
        // `position` is relative to the matcher used, hence `+=`
        this.regexIndex += result.position + 1;
        if (this.regexIndex === this.count) { // wrap-around
          this.regexIndex = 0;
        }
      }

      return result;
    }
  }

  /**
   * Builds the matcher for a mode: one "begin" rule per contained mode,
   * followed by the mode's own "end" rule (if it has a `terminator_end`) and
   * its "illegal" rule (if it has one).
   *
   * @param {Object} mode - a mode whose `contains` has already been compiled
   * @returns {ResumableMultiRegex}
   */
  function buildModeRegex(mode) {
    const mm = new ResumableMultiRegex();

    mode.contains.forEach(term => mm.addRule(term.begin, { rule: term, type: "begin" }));

    if (mode.terminator_end) {
      mm.addRule(mode.terminator_end, { type: "end" });
    }
    if (mode.illegal) {
      mm.addRule(mode.illegal, { type: "illegal" });
    }

    return mm;
  }

  /**
   * `__beforeBegin` hook installed on `beginKeywords` modes: asks for the
   * match to be ignored when the matched text is directly preceded or
   * followed by a dot.
   *
   * @param {Array} match - a regex match (uses `input`, `index` and `[0]`)
   * @param {Object} resp - response object; `resp.ignoreMatch()` is called to
   *   reject the match
   */
  // TODO: We need negative look-behind support to do this properly
  function skipIfhasPrecedingOrTrailingDot(match, resp) {
    const before = match.input[match.index - 1];
    const after = match.input[match.index + match[0].length];
    if (before === "." || after === ".") {
      resp.ignoreMatch();
    }
  }

  /** skip vs abort vs ignore
   *
   * @skip   - The mode is still entered and exited normally (and contains rules apply),
   *           but all content is held and added to the parent buffer rather than being
   *           output when the mode ends.  Mostly used with `sublanguage` to build up
   *           a single large buffer that can be parsed by sublanguage.
   *
   *             - The mode begins and ends normally.
   *             - Content matched is added to the parent mode buffer.
   *             - The parser cursor is moved forward normally.
   *
   * @abort  - A hack placeholder until we have ignore.  Aborts the mode (as if it
   *           never matched) but DOES NOT continue to match subsequent `contains`
   *           modes.  Abort is bad/suboptimal because it can result in modes
   *           farther down not getting applied because an earlier rule eats the
   *           content but then aborts.
   *
   *             - The mode does not begin.
   *             - Content matched by `begin` is added to the mode buffer.
   *             - The parser cursor is moved forward accordingly.
   *
   * @ignore - Ignores the mode (as if it never matched) and continues to match any
   *           subsequent `contains` modes.  Ignore isn't technically possible with
   *           the current parser implementation.
   *
   *             - The mode does not begin.
   *             - Content matched by `begin` is ignored.
   *             - The parser cursor is not moved forward.
   */

  /**
   * Compiles a single mode (and, recursively, its `contains` and `starts`
   * modes) in place.  A mode that is already flagged `compiled` is skipped.
   *
   * @param {Object} mode - the mode to compile; mutated in place
   * @param {Object} [parent] - the enclosing mode; omitted for the language
   *   itself, in which case no begin/end handling is done
   * @throws {Error} if the mode sets both `lexemes` and `keywords.$pattern`
   */
  function compileMode(mode, parent) {
    if (mode.compiled) return;
    mode.compiled = true;

    // __beforeBegin is considered private API, internal use only
    mode.__beforeBegin = null;

    mode.keywords = mode.keywords || mode.beginKeywords;

    let kw_pattern = null;
    if (mode.keywords && typeof mode.keywords === "object") {
      kw_pattern = mode.keywords.$pattern;
      // Strip `$pattern` from a shallow copy, never from the object we were
      // given: the same keywords object may be referenced by several modes,
      // and deleting the key from the original would make every mode compiled
      // after this one lose its pattern.
      mode.keywords = Object.assign({}, mode.keywords);
      delete mode.keywords.$pattern;
    }

    if (mode.keywords) {
      mode.keywords = compileKeywords(mode.keywords, language.case_insensitive);
    }

    // both are not allowed
    if (mode.lexemes && kw_pattern) {
      throw new Error("ERR: Prefer `keywords.$pattern` to `mode.lexemes`, BOTH are not allowed. (see mode reference) ");
    }

    // `mode.lexemes` was the old standard before we added and now recommend
    // using `keywords.$pattern` to pass the keyword pattern
    mode.keywordPatternRe = langRe(mode.lexemes || kw_pattern || /\w+/, true);

    if (parent) {
      if (mode.beginKeywords) {
        // for languages with keywords that include non-word characters checking for
        // a word boundary is not sufficient, so instead we check for a word boundary
        // or whitespace - this does no harm in any case since our keyword engine
        // doesn't allow spaces in keywords anyways and we still check for the boundary
        // first
        mode.begin = '\\b(' + mode.beginKeywords.split(' ').join('|') + ')(?=\\b|\\s)';
        mode.__beforeBegin = skipIfhasPrecedingOrTrailingDot;
      }
      // a mode without `begin` starts at any position
      if (!mode.begin) {
        mode.begin = /\B|\b/;
      }
      mode.beginRe = langRe(mode.begin);
      if (mode.endSameAsBegin) {
        mode.end = mode.begin;
      }
      // a mode without `end` that does not end with its parent ends at any
      // position
      if (!mode.end && !mode.endsWithParent) {
        mode.end = /\B|\b/;
      }
      if (mode.end) {
        mode.endRe = langRe(mode.end);
      }
      mode.terminator_end = regex.source(mode.end) || '';
      // a mode that ends with its parent is also terminated by whatever
      // terminates the parent
      if (mode.endsWithParent && parent.terminator_end) {
        mode.terminator_end += (mode.end ? '|' : '') + parent.terminator_end;
      }
    }
    if (mode.illegal) {
      mode.illegalRe = langRe(mode.illegal);
    }
    if (mode.relevance == null) {
      mode.relevance = 1;
    }
    if (!mode.contains) {
      mode.contains = [];
    }
    // replace 'self' by the mode itself and flatten the arrays returned for
    // modes that expand into variants
    mode.contains = [].concat(...mode.contains.map(function(c) {
      return expand_or_clone_mode(c === 'self' ? mode : c);
    }));
    mode.contains.forEach(function(c) { compileMode(c, mode); });

    if (mode.starts) {
      compileMode(mode.starts, parent);
    }

    mode.matcher = buildModeRegex(mode);
  }

  // self is not valid at the top-level
  if (language.contains && language.contains.includes('self')) {
    throw new Error("ERR: contains `self` is not supported at the top-level of a language.  See documentation.");
  }
  compileMode(language);
}
 | |
| 
 | |
/**
 * Tells whether a mode - or any mode reachable through its chain of `starts`
 * modes - has `endsWithParent` set.
 *
 * @param {Object|null|undefined} mode - the mode to inspect
 * @returns {*} `false` when `mode` is falsy or no mode in the chain has
 *   `endsWithParent`; otherwise the first truthy `endsWithParent` value found
 */
function dependencyOnParent(mode) {
  if (!mode) return false;

  return mode.endsWithParent || dependencyOnParent(mode.starts);
}
 | |
| 
 | |
/**
 * Prepares a mode for inclusion in a parent's `contains` list.
 *
 * @param {Object} mode - the mode to prepare
 * @returns {Object|Object[]} one of:
 *   - an array with one mode per variant, when the mode has `variants`;
 *   - a copy of the mode, when it (or a mode in its `starts` chain) has
 *     `endsWithParent`, or when the mode is frozen;
 *   - the mode itself otherwise.
 */
function expand_or_clone_mode(mode) {
  if (mode.variants && !mode.cached_variants) {
    const expanded = mode.variants.map(function(variant) {
      return inherit(mode, { variants: null }, variant);
    });
    // A frozen / non-extensible mode cannot take the cache property: in
    // module (strict) code the assignment would throw a TypeError.  Return
    // the freshly expanded variants without caching them instead.
    if (!Object.isExtensible(mode)) {
      return expanded;
    }
    mode.cached_variants = expanded;
  }

  // EXPAND
  // if we have variants then essentially "replace" the mode with the variants
  // this happens in compileMode, where this function is called from
  if (mode.cached_variants) {
    return mode.cached_variants;
  }

  // CLONE
  // if we have dependencies on parents then we need a unique
  // instance of ourselves, so we can be reused with many
  // different parents without issue
  if (dependencyOnParent(mode)) {
    return inherit(mode, { starts: mode.starts ? inherit(mode.starts) : null });
  }

  // a frozen mode is returned as a copy rather than as the frozen object
  if (Object.isFrozen(mode)) {
    return inherit(mode);
  }

  // no special dependency issues, just return ourselves
  return mode;
}
 | |
| 
 | |
| // keywords
 | |
| 
 | |
/**
 * Compiles a keyword definition into a lookup table.
 *
 * @param {string|Object} rawKeywords - either a whitespace-separated keyword
 *   string (all keywords get the class name `'keyword'`), or an object mapping
 *   class names to such strings.  Each keyword may carry an explicit score,
 *   written as `word|score`.
 * @param {boolean} case_insensitive - when truthy the keyword strings are
 *   lower-cased before being split
 * @returns {Object} table mapping each keyword to `[className, score]`
 */
function compileKeywords(rawKeywords, case_insensitive) {
  const compiled_keywords = {};

  if (typeof rawKeywords === 'string') { // string
    splitAndCompile('keyword', rawKeywords);
  } else {
    Object.keys(rawKeywords).forEach(function(className) {
      splitAndCompile(className, rawKeywords[className]);
    });
  }
  return compiled_keywords;

  // ---

  /** Adds every keyword of `str` to the table under `className`. */
  function splitAndCompile(className, str) {
    if (case_insensitive) {
      str = str.toLowerCase();
    }
    // Split on runs of whitespace and drop empty tokens so that leading,
    // trailing or repeated whitespace (including line breaks) never registers
    // an empty-string keyword.
    str.split(/\s+/).filter(Boolean).forEach(function(keyword) {
      const pair = keyword.split('|');
      compiled_keywords[pair[0]] = [className, scoreForKeyword(pair[0], pair[1])];
    });
  }
}
 | |
| 
 | |
/**
 * Determines the relevance score of a keyword.
 *
 * @param {string} keyword - the keyword
 * @param {string} [providedScore] - explicit score from the grammar (the part
 *   after `|` in `word|score`), if any
 * @returns {number} `Number(providedScore)` when a score was provided;
 *   otherwise 0 for common keywords and 1 for everything else
 */
function scoreForKeyword(keyword, providedScore) {
  // manual scores always win over common keywords
  // so you can force a score of 1 if you really insist
  if (providedScore) {
    return Number(providedScore);
  }

  return commonKeyword(keyword) ? 0 : 1;
}
 | |
| 
 | |
/**
 * Tells whether a word is one of the common keywords listed in
 * `COMMON_KEYWORDS`; the comparison ignores case.
 *
 * @param {string} word - the word to check
 * @returns {boolean}
 */
function commonKeyword(word) {
  return COMMON_KEYWORDS.includes(word.toLowerCase());
}
 |