/* eslint-disable */
import { Util } from '@libs/utilities/util';
import { EnumLanguage } from '@libs/constants';
import { TextSegmenter } from '../text-segmenter/text-segmenter';

import { StopWordFilter } from '../stop-word-filter';
import { EnumKeywordExtractSorting, IKeywordExtractOptions, IKeywordWithCount, IKeywordWithScore, ITextWeights } from './keyword-extractor.interface';


/**------------------------------------------------------
 * Keyword Extraction Library
 */
export class KeywordExtractor {

	//** Configurations */
	private readonly SUPPORTED_LANGUAGES : EnumLanguage[] = [EnumLanguage.EN, EnumLanguage.DE, EnumLanguage.FR, EnumLanguage.IT, EnumLanguage.ES, EnumLanguage.JA];
	private readonly REQUIRE_SEGMENTATION: EnumLanguage[] = [EnumLanguage.JA];

	//** Patterns (compiled once instead of once per processed word) */
	private static readonly HTML_TAG_PATTERN	: RegExp = /(<([^>]+)>)/gi;								// anything between "<" and ">"
	private static readonly URL_PATTERN			: RegExp = /https?:\/\/.*[\r\n]*/;						// words containing a http(s) url are kept untouched
	private static readonly EDGE_DASH_PATTERN	: RegExp = /^-+|-+$/g;									// leading / trailing dashes
	private static readonly PUNCTUATION_PATTERN	: RegExp = /\.|,|;|!|\?|\(|\)|:|"|^'|'$|“|”|‘|’|\|/g;	// punctuation, brackets, quotes and pipes
	private static readonly ODD_SINGLE_PATTERN	: RegExp = /_|@|&|#/g;									// symbols which are dropped if they are the whole word
	private static readonly ONLY_DIGITS_PATTERN	: RegExp = /^\d+$/;										// the word consists of digits only

	/**
	 * @param stopWordsFilter	used to decide (per language) whether a word is a stop word
	 * @param textSegmenter		used to split texts of the languages listed in REQUIRE_SEGMENTATION
	 * @throws Error if one of the two dependencies is not provided
	 */
	constructor(
		private stopWordsFilter	: StopWordFilter,
		private textSegmenter	: TextSegmenter,
	) {
		if (!this.stopWordsFilter) throw new Error(`KeywordExtractor => constructor => FATAL ERROR: the provided stopWordsFilter is not defined`);
		if (!this.textSegmenter)   throw new Error(`KeywordExtractor => constructor => FATAL ERROR: the provided textSegmenter is not defined`);
	}


	/**------------------------------------------------------
	 * Extracts keywords from the sentence
	 */

	/**
	 * Extracts the keywords of the sentence and returns only the keyword strings.
	 * @throws Error for an unsupported language, an empty text or an invalid numeric option
	 */
	extractKeywords(sentence: string, extractOptions: Partial<IKeywordExtractOptions>): string[] {
		return this.extractKeywordsAlgorithm(sentence, extractOptions).map((elem: IKeywordWithCount) => elem.keyword);
	}

	/**
	 * Extracts the keywords of the sentence together with the number of times
	 * each keyword was found in the (unfiltered) keyword list.
	 * @throws Error for an unsupported language, an empty text or an invalid numeric option
	 */
	extractKeywordsWithCount(sentence: string, extractOptions: Partial<IKeywordExtractOptions>): IKeywordWithCount[] {
		return this.extractKeywordsAlgorithm(sentence, extractOptions);
	}

	/**
	 * Extracts the keywords of the sentence and scores them against the weighted texts
	 * (see generateKeywordsScore for the scoring itself).
	 * @throws Error for an unsupported language, an empty text or an invalid numeric option
	 */
	extractKeywordsWithScore(sentence: string, textWeights: ITextWeights[], extractOptions: Partial<IKeywordExtractOptions>): IKeywordWithScore[] {
		const keywords: IKeywordWithCount[] = this.extractKeywordsAlgorithm(sentence, extractOptions);
		return this.generateKeywordsScore(keywords, textWeights);
	}


	/**------------------------------------------------------
	 * Keyword Scoring
	 */

	/**
	 * Scores every keyword against the weighted texts.
	 *
	 * For each text the share "matches of the keyword / words of the text" is multiplied
	 * with the weight of the text; the shares of all texts are summed up per keyword and
	 * finally expressed as percentage of the sum over all keywords (all scores are 0 if
	 * nothing matched at all). The returned count is the number of matches found in the
	 * weighted texts, not the count of the passed keyword object.
	 *
	 * @param keywords		keywords to score (only the "keyword" property is read)
	 * @param textWeights	texts with their weights
	 * @returns one entry per passed keyword, in the same order
	 * @throws Error if keywords (or, for a non-empty keyword list, textWeights) is not an array
	 */
	generateKeywordsScore(keywords: IKeywordWithCount[], textWeights: ITextWeights[]): IKeywordWithScore[] {

		//0 - verify the input (an empty keyword list never touches the text weights)
		if (!Util.Basic.isArray(keywords)) 	  throw new Error(`KeywordsExtractor => generateKeywordsScore => FATAL ERROR : Invalid keywords ${keywords}`);
		if (keywords.length === 0) 			  return [];
		if (!Util.Basic.isArray(textWeights)) throw new Error(`KeywordsExtractor => generateKeywordsScore => FATAL ERROR : Invalid textWeights ${textWeights}`);

		//1 - count the words of every weighted text once (at least 1, to avoid a division by zero)
		const wordTotals: number[] = textWeights.map((textWeightObj: ITextWeights) =>
			textWeightObj.text.split(' ').filter((word: string) => !Util.String.isEmpty(word)).length || 1
		);

		//2 - generate scores
		let scoreSum  		 : number 	   		   = 0;
		const mappedKeywords : IKeywordWithScore[] = [];
		for (const keywordData of keywords) {

			//a. one expression per keyword; String.match resets the position of a global expression, so it can be reused for all texts
			const keywordRegExp: RegExp = new RegExp(`(?:^|\\s|\\b)${Util.RegExp.escapeRegExp(keywordData.keyword)}(?:$|\\s|\\b)`, 'ig');

			//b. calculating the totalScore and count
			let totalScore: number = 0;
			let count	  : number = 0;
			for (let i = 0; i < textWeights.length; i++) {
				const foundWords 	 : string[]|null = textWeights[i].text.match(keywordRegExp);	// is the keyword present in the text?
				const foundWordsCount: number		 = (foundWords) ? foundWords.length : 0;

				totalScore += (foundWordsCount/wordTotals[i]) * textWeights[i].weight;
				count	   += foundWordsCount;
			}

			//c. returning the mapping
			scoreSum += totalScore;
			mappedKeywords.push({
				keyword  : keywordData.keyword,
				score	 : totalScore,
				count	 : count
			});
		}

		//3 - map final scores (relative scoring to all)
		for (const keyword of mappedKeywords) {
			keyword.score = (scoreSum) ? (keyword.score *100)/scoreSum : 0;
		}

		//4 - return final keywords
		return mappedKeywords;
	}


	/**------------------------------------------------------
	 * Keyword Extraction Algorithm
	 */

	/**
	 * Strips the tags from the sentence, splits it into words, cleans the words, removes
	 * the stop words and applies the options (occurrence, length, exclusion, sorting,
	 * duplicates, max results).
	 *
	 * @returns the remaining keywords with the number of times each one appears in the
	 *          keyword list before the special options were applied
	 * @throws Error for an unsupported language, an empty text or a numeric option smaller than 1
	 */
	private extractKeywordsAlgorithm(sentence: string, extractOptions: Partial<IKeywordExtractOptions>): IKeywordWithCount[] {

		//0 - purify & validate the options (a non-string sentence ends up in the "is empty" error below)
		const rawText: string 				  = (typeof sentence === 'string') ? sentence : '';
		const text	 : string 			  	  = rawText.replace(KeywordExtractor.HTML_TAG_PATTERN, '').trim();
		const options: IKeywordExtractOptions = this.getOptions(extractOptions);

		if (!Util.Array.contains(this.SUPPORTED_LANGUAGES, options.language)) throw new Error(`KeywordsExtractor => extractKeywordsAlgorithm => FATAL ERROR: language of "${options.language}" must be one of [${this.SUPPORTED_LANGUAGES}]`);
		if (Util.String.isEmpty(text)) 		throw new Error(`KeywordsExtractor => extractKeywordsAlgorithm => FATAL ERROR: provided text of "${sentence}" is empty`);
		if (options.minOccurrenceCount < 1) throw new Error(`KeywordsExtractor => extractKeywordsAlgorithm => FATAL ERROR: provided occurrenceCount of "${options.minOccurrenceCount}" is smaller 1`);
		if (options.minLength < 1) 			throw new Error(`KeywordsExtractor => extractKeywordsAlgorithm => FATAL ERROR: provided minLength of "${options.minLength}" is smaller 1`);
		if (options.maxResults < 1) 		throw new Error(`KeywordsExtractor => extractKeywordsAlgorithm => FATAL ERROR: provided maxResults of "${options.maxResults}" is smaller 1`);

		//1 - clean the words; the cleaned word and its lower cased version are tracked at the same index
		const unchangedWords: string[] = [];
		const lowCasedWords	: string[] = [];
		for (const rawWord of this.getTextAsWordArray(options.language, text)) {

			//a. remove the dashes at the edges and the punctuation (words containing an url are kept as they are)
			let word: string = KeywordExtractor.URL_PATTERN.test(rawWord)
									? rawWord
									: rawWord.replace(KeywordExtractor.EDGE_DASH_PATTERN, '').replace(KeywordExtractor.PUNCTUATION_PATTERN, '');

			//b. if this is a short result, make sure it's not a single character or something 'odd'
			if (word.length === 1) word = word.replace(KeywordExtractor.ODD_SINGLE_PATTERN, '');
			if (Util.String.isEmpty(word)) continue;

			//c. skip words which consist of digits only if removeDigits is true
			if (options.removeDigits && KeywordExtractor.ONLY_DIGITS_PATTERN.test(word)) continue;

			//d. tracks both the converted low letter words and non-converted words
			lowCasedWords.push(word.toLowerCase());
			unchangedWords.push(word);
		}

		//2 - remove all the stop words and filter the input sentence
		const allKeywords: string[] = this.filterKeywords(lowCasedWords, unchangedWords, options);

		//3 - apply special options
		const filteredKeywords: string[] = this.applySpecialOptions(allKeywords, options);

		//4 - count every keyword once (a Map is safe for words like "constructor") & create the keyword objects
		const occurrences: Map<string, number> = new Map<string, number>();
		for (const keyword of allKeywords) {
			occurrences.set(keyword, (occurrences.get(keyword) ?? 0) + 1);
		}

		return filteredKeywords.map((keyword: string): IKeywordWithCount => ({
			keyword : keyword,
			count   : occurrences.get(keyword) ?? 0,
		}));
	}


	/**------------------------------------------------------
	 * Helper Function
	 */

	/**
	 * Completes the passed (partial) options with the fallback values.
	 * A missing options object is handled like an empty one.
	 */
	private getOptions(extractOptions: Partial<IKeywordExtractOptions>): IKeywordExtractOptions {
		const provided: Partial<IKeywordExtractOptions> = extractOptions ?? {};
		const options : IKeywordExtractOptions = {
			language		 	: Util.Basic.fallbackValue(provided.language?.toLowerCase(), EnumLanguage.EN),
			caseInsensitive		: Util.Basic.fallbackValue(provided.caseInsensitive, true),
			maxResults			: Util.Basic.fallbackValue(provided.maxResults, Number.MAX_SAFE_INTEGER),
			removeDigits 	 	: Util.Basic.fallbackValue(provided.removeDigits, false),
			removeDuplicates 	: Util.Basic.fallbackValue(provided.removeDuplicates, true),
			minLength 		 	: Util.Basic.fallbackValue(provided.minLength, 1),
			minOccurrenceCount	: Util.Basic.fallbackValue(provided.minOccurrenceCount, 1),
			chainedWords 	 	: Util.Basic.fallbackValue(provided.chainedWords, false),
			maxNgrams 		 	: Util.Basic.fallbackValue(provided.maxNgrams, 0),
			excludeWords	 	: Util.Basic.fallbackValue(provided.excludeWords, []),
			sorting			 	: Util.Basic.fallbackValue(provided.sorting, EnumKeywordExtractSorting.None),
		}
		return options;
	}

	/**
	 * Splits the text into words: languages listed in REQUIRE_SEGMENTATION are handed
	 * to the text segmenter, all others are split at every whitespace character.
	 */
	private getTextAsWordArray(language: EnumLanguage, text: string): string[] {

		//0 - check if it is a special lang, which needs segmentation
		if (this.REQUIRE_SEGMENTATION.includes(language)) {
			return this.textSegmenter.segmentText(text, language);
		}

		//1 - split text by spaces to get words array
		return text.split(/\s/);
	}

	//** Filter Keywords */
	/**
	 * Drops the stop words and (if chainedWords or a positive maxNgrams is set) joins
	 * a word to the previous result.
	 *
	 * A word is appended (space separated) to the last result if the last *pushed* word
	 * sits directly before it. Appending does not move that marker, so the word following
	 * an appended word is pushed as a new result again.
	 *
	 * NOTE(review): the marker starts at 0, therefore the word at index 1 counts as
	 * "adjacent" even if the word at index 0 was a stop word - confirm whether intended.
	 * NOTE(review): maxNgrams only acts as an on/off switch here (any value > 0 behaves
	 * the same) - confirm whether a real n-gram limit was intended.
	 *
	 * @param lowCasedWords		lower cased words (checked against the stop words)
	 * @param unchangedWords	same words in their original casing (same indexes)
	 */
	private filterKeywords(lowCasedWords: string[], unchangedWords: string[], options: IKeywordExtractOptions): string[] {

		//0 - walk through the words and collect everything which is not a stop word
		const results: string[] = [];
		let lastResultWordIndex: number = 0;
		for (let i = 0; i < lowCasedWords.length; i++) {

			//a. if the word is present in the stopwords then it will not be added to the results
			if (this.stopWordsFilter.isStopWord(lowCasedWords[i], options.language)) continue;

			//b. original casing is kept for case sensitive extraction and for words containing an url
			const keepOriginalCase: boolean = !options.caseInsensitive || KeywordExtractor.URL_PATTERN.test(unchangedWords[i]);
			const resultWord	  : string  = keepOriginalCase ? unchangedWords[i] : lowCasedWords[i];

			//c. append to the previous result if the last pushed word is the direct predecessor and chaining / ngrams are active
			const followsLastPushedWord: boolean = lastResultWordIndex === i-1;
			if (followsLastPushedWord && (options.chainedWords || options.maxNgrams > 0)) {
				const changePos: number = Math.max(results.length-1, 0);
				results[changePos] = results[changePos]
										? results[changePos] + ' ' + resultWord
										: resultWord;
				continue;
			}

			//d. otherwise the word starts a new result
			results.push(resultWord);
			lastResultWordIndex = i;
		}

		//1 - return the result
		return results;
	}

	/**
	 * Applies minOccurrenceCount, minLength, excludeWords, sorting, removeDuplicates
	 * and maxResults (in this order) to the words. The passed array is not modified.
	 */
	private applySpecialOptions(words: string[], options: IKeywordExtractOptions): string[] {

		//0 - filter the results by occurrences (a Map, as a plain object breaks for words like "constructor")
		const occurrences: Map<string, number> = new Map<string, number>();
		for (const word of words) {
			occurrences.set(word, (occurrences.get(word) ?? 0) + 1);
		}
		words = words.filter((word: string) => (occurrences.get(word) ?? 0) >= options.minOccurrenceCount);

		//1 - filters the array based on minimum length of the word
		if (options.minLength) words = words.filter((word: string) => word.length >= options.minLength);

		//2 - remove the excluded ones
		if (options.excludeWords.length !== 0) words = Util.Array.removeByValues(words, options.excludeWords);

		//3 - sort the array
		words = this.applySorting(words, options);

		//4 - remove duplicates is true then removes the duplicate values (has to be after sorting!)
		if (options.removeDuplicates) words = Util.Array.unique(words);

		//5 - shorten the result for maxResults
		words = words.slice(0, options.maxResults);

		return words;
	}

	/**
	 * Sorts the words according to options.sorting by delegating to the matching
	 * Util.Keywords helper; nothing is done for "None" or an empty list.
	 * @throws Error if the sorting is not one of the handled values
	 */
	private applySorting(words: string[], options: IKeywordExtractOptions): string[] {

		//0 - does any sorting have to be applied?
		if (options.sorting === EnumKeywordExtractSorting.None || words.length === 0) return words;

		//1 - apply the sorting
		switch (options.sorting) {
			case EnumKeywordExtractSorting.OccurrenceAsc:
				return Util.Keywords.sortByOccurrenceAsc(words, options.caseInsensitive);

			case EnumKeywordExtractSorting.OccurrenceDesc:
				return Util.Keywords.sortByOccurrenceDesc(words, options.caseInsensitive);

			case EnumKeywordExtractSorting.WordLengthAsc:
				return Util.Keywords.sortByLengthAsc(words);

			case EnumKeywordExtractSorting.WordLengthDesc:
				return Util.Keywords.sortByLengthDesc(words);

			case EnumKeywordExtractSorting.Random:
				return Util.Keywords.shuffle(words);

			case EnumKeywordExtractSorting.NonDeterministic:
				return Util.Keywords.randomizeNonDeterministic(words);

			default:
				throw new Error(`KeywordsExtractor => applySorting => FATAL ERROR: sorting of "${options.sorting}" is not supported (available sorting: ${Util.Enum.values(EnumKeywordExtractSorting)})`);
		}
	}
}
