mirror of
https://github.com/JKorf/CryptoExchange.Net
synced 2025-06-08 08:26:20 +00:00
355 lines
12 KiB
JavaScript
355 lines
12 KiB
JavaScript
import * as regex from './regex';
|
|
import { inherit } from './utils';
|
|
|
|
// keywords that should have no default relevance value
|
|
const COMMON_KEYWORDS = 'of and for in not or if then'.split(' '); // space-separated list -> array of words
|
|
|
|
// compilation
|
|
|
|
export function compileLanguage(language) {
|
|
|
|
/**
 * Builds a RegExp for the language currently being compiled.
 *
 * The expression is always multiline (`m`); `i` is added when the language
 * is flagged `case_insensitive` and `g` when `global` is truthy.
 *
 * @param {RegExp|string} value - pattern, passed through `regex.source`
 * @param {boolean} [global] - whether to add the `g` flag
 * @returns {RegExp}
 */
function langRe(value, global) {
  const pattern = regex.source(value);

  let flags = 'm';
  if (language.case_insensitive) {
    flags += 'i';
  }
  if (global) {
    flags += 'g';
  }

  return new RegExp(pattern, flags);
}
|
|
|
|
/**
|
|
Stores multiple regular expressions and allows you to quickly search for
|
|
them all in a string simultaneously - returning the first match. It does
|
|
this by creating a huge (a|b|c) regex - each individual item wrapped with ()
|
|
and joined by `|` - using match groups to track position. When a match is
|
|
found checking which position in the array has content allows us to figure
|
|
out which of the original regexes / match groups triggered the match.
|
|
|
|
The match object itself (the result of `Regex.exec`) is returned but also
|
|
enhanced by merging in any meta-data that was registered with the regex.
|
|
This is how we keep track of which mode matched, and what type of rule
|
|
(`illegal`, `begin`, end, etc).
|
|
*/
|
|
class MultiRegex {
  constructor() {
    // capture-group index (inside the combined regex) -> rule meta-data
    this.matchIndexes = {};
    // [meta-data, regex] pairs in registration order
    this.regexes = [];
    // capture-group index the next registered rule will be stored under
    this.matchAt = 1;
    // ordinal (0-based) that the next registered rule receives
    this.position = 0;
  }

  /**
   * Registers a regex together with the meta-data to merge into its matches.
   *
   * The meta-data is stored as a shallow copy that carries this matcher's
   * `position` for the rule. The caller's `opts` object is NOT modified:
   * the same `opts` object can be registered with several MultiRegex
   * instances (ResumableMultiRegex does exactly that), and writing
   * `position` onto the shared object would let the most recently built
   * matcher overwrite the positions reported by every earlier one.
   *
   * @param {RegExp|string} re - the expression to add
   * @param {object} opts - meta-data merged into the match object by `exec`
   */
  addRule(re, opts) {
    const ruleData = Object.assign({}, opts, { position: this.position++ });
    this.matchIndexes[this.matchAt] = ruleData;
    this.regexes.push([ruleData, re]);
    // skip past this rule's own capture groups plus the one that wraps it
    this.matchAt += regex.countMatchGroups(re) + 1;
  }

  /**
   * Joins every registered regex into a single global matcher.
   * Must be called after the last `addRule` and before `exec`.
   */
  compile() {
    if (this.regexes.length === 0) {
      // avoids the need to check length every time exec is called
      this.exec = () => null;
    }
    const terminators = this.regexes.map(([, re]) => re);
    this.matcherRe = langRe(regex.join(terminators, '|'), true);
    this.lastIndex = 0;
  }

  /**
   * Searches `s` starting at `this.lastIndex`.
   *
   * @param {string} s - text to search
   * @returns {(Array & object)|null} the `RegExp#exec` result, trimmed so that
   *   index 0 is the text matched by the winning rule, with that rule's
   *   meta-data (including `position`) merged in; `null` when nothing matches
   */
  exec(s) {
    this.matcherRe.lastIndex = this.lastIndex;
    const match = this.matcherRe.exec(s);
    if (!match) { return null; }

    // the first defined capture group identifies which rule matched
    // eslint-disable-next-line no-undefined
    const i = match.findIndex((el, i) => i > 0 && el !== undefined);
    const matchData = this.matchIndexes[i];
    // trim off any earlier non-relevant match groups (ie, the other regex
    // match groups that make up the multi-matcher)
    match.splice(0, i);

    return Object.assign(match, matchData);
  }
}
|
|
|
|
/*
|
|
Created to solve the key deficiency of MultiRegex - there is no way to
|
|
test for multiple matches at a single location. Why would we need to do
|
|
that? In the future a more dynamic engine will allow certain matches to be
|
|
ignored. An example: if we matched say the 3rd regex in a large group but
|
|
decided to ignore it - we'd need to start testing again at the 4th
|
|
regex... but MultiRegex itself gives us no real way to do that.
|
|
|
|
So this class creates MultiRegexes on the fly for whatever search
|
|
position they are needed.
|
|
|
|
NOTE: These additional MultiRegex objects are created dynamically. For most
|
|
grammars most of the time we will never actually need anything more than the
|
|
first MultiRegex - so this shouldn't have too much overhead.
|
|
|
|
Say this is our search group, and we match regex3, but wish to ignore it.
|
|
|
|
regex1 | regex2 | regex3 | regex4 | regex5 ' ie, startAt = 0
|
|
|
|
What we need is a new MultiRegex that only includes the remaining
|
|
possibilities:
|
|
|
|
regex4 | regex5 ' ie, startAt = 3
|
|
|
|
This class wraps all that complexity up in a simple API... `startAt` decides
|
|
where in the array of expressions to start doing the matching. It
|
|
auto-increments, so if a match is found at position 2, then startAt will be
|
|
set to 3. If the end is reached startAt will return to 0.
|
|
|
|
MOST of the time the parser will be setting startAt manually to 0.
|
|
*/
|
|
class ResumableMultiRegex {
  constructor() {
    // every registered [regex, meta-data] pair, in order
    this.rules = [];
    // lazily built MultiRegex instances, indexed by the rule they start at
    this.multiRegexes = [];
    // number of registered rules whose type is "begin"
    this.count = 0;

    this.lastIndex = 0;
    this.regexIndex = 0;
  }

  /**
   * Returns the matcher covering the rules from `index` onwards, building
   * and caching it the first time it is requested.
   */
  getMatcher(index) {
    const cached = this.multiRegexes[index];
    if (cached) return cached;

    const fresh = new MultiRegex();
    for (const [re, opts] of this.rules.slice(index)) {
      fresh.addRule(re, opts);
    }
    fresh.compile();

    this.multiRegexes[index] = fresh;
    return fresh;
  }

  /** Makes the next `exec` consider every rule again. */
  considerAll() {
    this.regexIndex = 0;
  }

  /** Registers a rule; "begin" rules are additionally tallied in `count`. */
  addRule(re, opts) {
    this.rules.push([re, opts]);
    if (opts.type === "begin") {
      this.count += 1;
    }
  }

  /**
   * Runs the matcher for the current `regexIndex` against `s`, starting at
   * `lastIndex`. On a match `regexIndex` moves just past the matched rule,
   * going back to 0 when it lands exactly on `count`.
   */
  exec(s) {
    const matcher = this.getMatcher(this.regexIndex);
    matcher.lastIndex = this.lastIndex;

    const found = matcher.exec(s);
    if (!found) return found;

    const next = this.regexIndex + found.position + 1;
    // wrap-around
    this.regexIndex = next === this.count ? 0 : next;

    return found;
  }
}
|
|
|
|
/**
 * Creates the matcher for a mode: one "begin" rule per contained mode,
 * followed by an "end" rule when the mode has a `terminator_end` and an
 * "illegal" rule when it has an `illegal` pattern.
 *
 * @param {object} mode - a mode whose `contains` is an array
 * @returns {ResumableMultiRegex}
 */
function buildModeRegex(mode) {
  const matcher = new ResumableMultiRegex();

  mode.contains.forEach(function registerBegin(child) {
    matcher.addRule(child.begin, { rule: child, type: "begin" });
  });

  const { terminator_end: terminator, illegal } = mode;
  if (terminator) {
    matcher.addRule(terminator, { type: "end" });
  }
  if (illegal) {
    matcher.addRule(illegal, { type: "illegal" });
  }

  return matcher;
}
|
|
|
|
// TODO: We need negative look-behind support to do this properly
|
|
/**
 * Calls `resp.ignoreMatch()` when the character immediately before or
 * immediately after the matched text is a dot.
 *
 * @param {object} match - a `RegExp#exec` result (`index`, `input`, `[0]`)
 * @param {object} resp - object exposing `ignoreMatch()`
 */
function skipIfhasPrecedingOrTrailingDot(match, resp) {
  const text = match.input;
  const start = match.index;
  const end = start + match[0].length;

  // out-of-range reads yield undefined, which never equals "."
  const neighbours = [text[start - 1], text[end]];
  if (neighbours.includes(".")) {
    resp.ignoreMatch();
  }
}
|
|
|
|
/** skip vs abort vs ignore
|
|
*
|
|
* @skip - The mode is still entered and exited normally (and contains rules apply),
|
|
* but all content is held and added to the parent buffer rather than being
|
|
* output when the mode ends. Mostly used with `sublanguage` to build up
|
|
* a single large buffer that can be parsed by sublanguage.
|
|
*
|
|
* - The mode begins and ends normally.
|
|
* - Content matched is added to the parent mode buffer.
|
|
* - The parser cursor is moved forward normally.
|
|
*
|
|
* @abort - A hack placeholder until we have ignore. Aborts the mode (as if it
|
|
* never matched) but DOES NOT continue to match subsequent `contains`
|
|
* modes. Abort is bad/suboptimal because it can result in modes
|
|
* farther down not getting applied because an earlier rule eats the
|
|
* content but then aborts.
|
|
*
|
|
* - The mode does not begin.
|
|
* - Content matched by `begin` is added to the mode buffer.
|
|
* - The parser cursor is moved forward accordingly.
|
|
*
|
|
* @ignore - Ignores the mode (as if it never matched) and continues to match any
|
|
* subsequent `contains` modes. Ignore isn't technically possible with
|
|
* the current parser implementation.
|
|
*
|
|
* - The mode does not begin.
|
|
* - Content matched by `begin` is ignored.
|
|
* - The parser cursor is not moved forward.
|
|
*/
|
|
|
|
/**
 * Compiles a mode in place (and, recursively, the modes in its `contains`
 * and its `starts`). Already-compiled modes are left untouched.
 *
 * Sets on `mode`: `compiled`, `__beforeBegin`, `keywords` (compiled),
 * `keywordPatternRe`, `relevance` (default 1), `contains` (expanded),
 * `matcher`, `illegalRe` when `illegal` is given and, when a parent exists,
 * `begin`/`beginRe`, `end`/`endRe` and `terminator_end`.
 *
 * @param {object} mode - the mode (or top-level language) to compile
 * @param {object} [parent] - the enclosing mode; omitted for the top level
 * @throws {Error} when both `mode.lexemes` and `keywords.$pattern` are given
 */
function compileMode(mode, parent) {
  if (mode.compiled) return;
  mode.compiled = true;

  // __beforeBegin is considered private API, internal use only
  mode.__beforeBegin = null;

  mode.keywords = mode.keywords || mode.beginKeywords;

  let kw_pattern = null;
  // the truthiness check matters: `typeof null` is also "object"
  if (mode.keywords && typeof mode.keywords === "object") {
    kw_pattern = mode.keywords.$pattern;
    // Strip `$pattern` from a shallow copy, never from the caller's object:
    // a keywords object may be referenced by more than one mode, and
    // deleting from the original would leave every mode compiled after this
    // one without its pattern.
    mode.keywords = Object.assign({}, mode.keywords);
    delete mode.keywords.$pattern;
  }

  if (mode.keywords) {
    mode.keywords = compileKeywords(mode.keywords, language.case_insensitive);
  }

  // both are not allowed
  if (mode.lexemes && kw_pattern) {
    throw new Error("ERR: Prefer `keywords.$pattern` to `mode.lexemes`, BOTH are not allowed. (see mode reference) ");
  }

  // `mode.lexemes` was the old standard before we added and now recommend
  // using `keywords.$pattern` to pass the keyword pattern
  mode.keywordPatternRe = langRe(mode.lexemes || kw_pattern || /\w+/, true);

  if (parent) {
    if (mode.beginKeywords) {
      // for languages with keywords that include non-word characters checking for
      // a word boundary is not sufficient, so instead we check for a word boundary
      // or whitespace - this does no harm in any case since our keyword engine
      // doesn't allow spaces in keywords anyways and we still check for the boundary
      // first
      mode.begin = '\\b(' + mode.beginKeywords.split(' ').join('|') + ')(?=\\b|\\s)';
      mode.__beforeBegin = skipIfhasPrecedingOrTrailingDot;
    }
    if (!mode.begin) {
      mode.begin = /\B|\b/;
    }
    mode.beginRe = langRe(mode.begin);
    if (mode.endSameAsBegin) {
      mode.end = mode.begin;
    }
    if (!mode.end && !mode.endsWithParent) {
      mode.end = /\B|\b/;
    }
    if (mode.end) {
      mode.endRe = langRe(mode.end);
    }
    mode.terminator_end = regex.source(mode.end) || '';
    if (mode.endsWithParent && parent.terminator_end) {
      // a mode that ends with its parent is also terminated by the parent's terminator
      mode.terminator_end += (mode.end ? '|' : '') + parent.terminator_end;
    }
  }
  if (mode.illegal) {
    mode.illegalRe = langRe(mode.illegal);
  }
  if (mode.relevance == null) {
    mode.relevance = 1;
  }
  if (!mode.contains) {
    mode.contains = [];
  }
  // expand_or_clone_mode may return an array (variants), hence the flattening concat
  mode.contains = [].concat(...mode.contains.map(function(c) {
    return expand_or_clone_mode(c === 'self' ? mode : c);
  }));
  mode.contains.forEach(function(c) { compileMode(c, mode); });

  if (mode.starts) {
    compileMode(mode.starts, parent);
  }

  mode.matcher = buildModeRegex(mode);
}
|
|
|
|
// self is not valid at the top-level
|
|
// reject `self` among the language's own top-level `contains` entries
const topLevelContains = language.contains;
if (topLevelContains && topLevelContains.includes('self')) {
  throw new Error("ERR: contains `self` is not supported at the top-level of a language. See documentation.");
}

// the language itself is compiled as the root mode (no parent)
compileMode(language);
|
|
}
|
|
|
|
/**
 * Reports whether a mode, or any mode reachable through its `starts`
 * chain, has a truthy `endsWithParent`.
 *
 * @param {object} [mode]
 * @returns {*} the first truthy `endsWithParent` found, otherwise `false`
 */
function dependencyOnParent(mode) {
  return mode
    ? (mode.endsWithParent || dependencyOnParent(mode.starts))
    : false;
}
|
|
|
|
/**
 * Decides what should stand in for `mode` inside a parent's `contains`:
 *
 *  - a mode with `variants` is replaced by its list of variant modes (built
 *    once and cached on `mode.cached_variants`);
 *  - a mode that depends on its parent is replaced by a copy, so it can be
 *    reused under different parents;
 *  - a frozen mode is replaced by a copy;
 *  - any other mode is returned unchanged.
 *
 * @param {object} mode
 * @returns {object|object[]} a single mode, or an array of variant modes
 */
function expand_or_clone_mode(mode) {
  const needsVariantCache = mode.variants && !mode.cached_variants;
  if (needsVariantCache) {
    mode.cached_variants = mode.variants.map(
      (variant) => inherit(mode, { variants: null }, variant)
    );
  }

  // EXPAND: the variants take the place of the mode itself
  if (mode.cached_variants) {
    return mode.cached_variants;
  }

  // CLONE: a parent-dependent mode needs its own instance (and its own `starts`)
  if (dependencyOnParent(mode)) {
    const startsCopy = mode.starts ? inherit(mode.starts) : null;
    return inherit(mode, { starts: startsCopy });
  }

  // no special dependency issues: copy only when the mode is frozen
  return Object.isFrozen(mode) ? inherit(mode) : mode;
}
|
|
|
|
// keywords
|
|
|
|
/**
 * Turns a raw keyword definition into a lookup table.
 *
 * @param {string|Object<string, string>} rawKeywords - either a
 *   space-separated keyword list (class name `keyword`), or an object
 *   mapping class names to space-separated keyword lists. A keyword may
 *   carry an explicit score as `word|score`.
 * @param {boolean} case_insensitive - lower-case every list before splitting
 * @returns {Object<string, Array>} keyword -> `[className, score]`
 */
function compileKeywords(rawKeywords, case_insensitive) {
  const compiled_keywords = {};

  // registers every keyword of one space-separated list under `className`
  function splitAndCompile(className, str) {
    const list = case_insensitive ? str.toLowerCase() : str;
    list
      .split(' ')
      // doubled, leading or trailing spaces yield empty tokens; registering
      // them would create a bogus '' keyword
      .filter(function(token) { return token !== ''; })
      .forEach(function(token) {
        const pair = token.split('|');
        compiled_keywords[pair[0]] = [className, scoreForKeyword(pair[0], pair[1])];
      });
  }

  if (typeof rawKeywords === 'string') { // string
    splitAndCompile('keyword', rawKeywords);
  } else {
    Object.keys(rawKeywords).forEach(function(className) {
      splitAndCompile(className, rawKeywords[className]);
    });
  }
  return compiled_keywords;
}
|
|
|
|
/**
 * Determines the relevance score of a keyword.
 *
 * An explicitly provided (truthy) score always wins and is converted with
 * `Number`, so a score of 1 can be forced even for a common keyword.
 * Without one, common keywords score 0 and every other keyword scores 1.
 *
 * @param {string} keyword
 * @param {string} [providedScore] - the part after `|` in the keyword list
 * @returns {number}
 */
function scoreForKeyword(keyword, providedScore) {
  if (!providedScore) {
    return commonKeyword(keyword) ? 0 : 1;
  }
  return Number(providedScore);
}
|
|
|
|
/**
 * Tells whether `word`, compared case-insensitively, is one of the
 * COMMON_KEYWORDS.
 *
 * @param {string} word
 * @returns {boolean}
 */
function commonKeyword(word) {
  const lowered = word.toLowerCase();
  return COMMON_KEYWORDS.includes(lowered);
}
|