//
//  XTHtmlCharEntityParser.m
//  XTads
//

#import "XTHtmlCharEntityParser.h"
#import "XTStringUtils.h"
#import "XTLogger.h"
#import "XTAllocDeallocCounter.h"


/// States of the entity-recognizing state machine driven by -parse:.
typedef NS_ENUM(NSInteger, ExpanderState) {
	OUTSIDE_ENTITY,    // not inside an entity
	AFTER_AMPERSAND,   // seen "&"
	AFTER_HASH,        // seen "&#"
	AFTER_HEX_MARK,    // seen "&#x"
	IN_HEX_VALUE,      // seen "&#x" followed by at least one hex digit
	IN_DECIMAL_VALUE,  // seen "&#" followed by at least one decimal digit
	IN_NAMED_VALUE     // seen "&" followed by at least one name character
};

@interface XTHtmlCharEntityParser ()

// Both properties are read through their accessors once per input character
// in -parse:, so they are declared nonatomic to avoid per-access locking.

/// Current state; kept between -parse: calls so an entity may span calls.
@property (nonatomic, assign) ExpanderState state;

/// Raw input characters consumed since the last emit (the pending text/entity).
@property (nonatomic, strong) NSMutableString *charsConsumed;

@end


@implementation XTHtmlCharEntityParser

static XTLogger* logger;

// Character classes and the entity table shared by all instances;
// all of them are assigned in +initialize.
static  NSCharacterSet *entityNameCharSet;
static  NSCharacterSet *decimalDigitCharSet;
static  NSCharacterSet *hexDigitCharSet;
static  NSDictionary *charByNamedEntity;

// The helper macros below are wrapped in do { ... } while (0) so that each one
// expands to exactly one statement and is safe in an unbraced if/else.
// Every call site terminates the macro with its own ';'.

// Forget the pending raw input.
#define CLEAR_CONSUMED_CHARS \
	do { \
		[self.charsConsumed setString:@""]; \
	} while (0)

// Append the pending raw input to the caller's local `res` unchanged, then
// go back to the OUTSIDE_ENTITY state.
#define EMIT_CONSUMED_CHARS \
	do { \
		[res appendString:self.charsConsumed]; \
		CLEAR_CONSUMED_CHARS; \
		self.state = OUTSIDE_ENTITY; \
	} while (0)

// Un-consume the current character: drop it from the pending input and step
// the caller's local loop variable `index` back so the character is seen again
// on the next loop iteration.
#define BACKTRACK \
	do { \
		[XTStringUtils removeLastCharFrom:self.charsConsumed]; \
		index -= 1; \
	} while (0)

/**
 * One-time class setup: creates the logger, the character sets used by the
 * state machine, and the named-entity lookup table.
 */
+ (void)initialize
{
	// The runtime also sends +initialize to this class on behalf of any subclass
	// that doesn't implement it; don't redo the (fairly large) setup in that case.
	if (self != [XTHtmlCharEntityParser class]) {
		return;
	}

	logger = [XTLogger loggerForClass:[XTHtmlCharEntityParser class]];

	entityNameCharSet = [NSCharacterSet letterCharacterSet];
	decimalDigitCharSet = [NSCharacterSet decimalDigitCharacterSet];
	hexDigitCharSet = [NSCharacterSet characterSetWithCharactersInString:@"0123456789abcdefABCDEF"];
	[XTHtmlCharEntityParser initCharByNamedEntity];
}

// NOTE(review): project macros whose definitions are not visible in this file;
// presumably they expand to alloc/dealloc overrides used for instance counting
// (see the XTAllocDeallocCounter.h import) -- confirm against that header.
OVERRIDE_ALLOC_FOR_COUNTER
OVERRIDE_DEALLOC_FOR_COUNTER

/**
 * Creates a parser that starts outside any entity, with no pending input.
 */
- (instancetype)init
{
	self = [super init];
	if (self) {
		_state = OUTSIDE_ENTITY;
		_charsConsumed = [NSMutableString stringWithCapacity:300];
	}
	return self;
}

/**
 * Expands the HTML character entities found in the next chunk of input.
 *
 * The parser is incremental: if `string` ends in the middle of an entity, the
 * raw characters seen so far stay in self.charsConsumed and self.state records
 * the position, so that a later -parse: call can complete the entity (or
 * -hardFlush can force it out).
 *
 * Entities may be terminated by ';' or left unterminated; an unterminated
 * entity ends at the first character that cannot belong to it, and that
 * character is then processed again as ordinary input.
 *
 * @param string The next chunk of input text.
 * @return The text that can be emitted so far, with each completed entity
 *         replaced by its expansion.
 */
- (NSString *)parse:(NSString *)string
{
	XT_DEF_SELNAME;

	NSMutableString *res = [NSMutableString stringWithCapacity:300];
	const NSUInteger length = string.length;

	// Note on BACKTRACK: it decrements `index`. When the very first character of
	// this chunk ends an entity carried over from an earlier call, index is 0 and
	// wraps around to NSUIntegerMax; the loop's index++ then wraps it back to 0,
	// so the character is still reprocessed correctly.
	for (NSUInteger index = 0; index < length; index++) {

		unichar ch = [string characterAtIndex:index];
		[self.charsConsumed appendFormat:@"%C", ch];
			//TODO try to make more efficient

		switch (self.state) {
			case OUTSIDE_ENTITY:
				if (ch == '&') {
					self.state = AFTER_AMPERSAND;
				} else {
					EMIT_CONSUMED_CHARS;
				}
				break;

			case AFTER_AMPERSAND:
				if (ch == '#') {
					self.state = AFTER_HASH;
				} else if ([entityNameCharSet characterIsMember:ch]) {
					self.state = IN_NAMED_VALUE;
				} else {
					// A lone "&": emit it literally and reprocess ch.
					BACKTRACK;
					EMIT_CONSUMED_CHARS;
				}
				break;

			case AFTER_HASH:
				// The hex marker is case-insensitive ("&#x41;" and "&#X41;").
				if (ch == 'x' || ch == 'X') {
					self.state = AFTER_HEX_MARK;
				} else if ([decimalDigitCharSet characterIsMember:ch]) {
					self.state = IN_DECIMAL_VALUE;
				} else {
					// "&#" not followed by a number: emit literally and reprocess ch.
					BACKTRACK;
					EMIT_CONSUMED_CHARS;
				}
				break;

			case IN_NAMED_VALUE:
				// A name starts with a letter (checked in AFTER_AMPERSAND) but may
				// continue with digits: sup1, sup2, sup3, frac14, frac12, frac34, there4.
				if ([entityNameCharSet characterIsMember:ch] || [decimalDigitCharSet characterIsMember:ch]) {
					// still inside the name
				} else {
					if (ch != ';') {
						// Unterminated entity: ch isn't part of it, reprocess ch afterwards.
						BACKTRACK;
					}
					[res appendString:[self expandNamedEntity:self.charsConsumed]];
					CLEAR_CONSUMED_CHARS;
					self.state = OUTSIDE_ENTITY;
				}
				break;

			case AFTER_HEX_MARK:
				if ([hexDigitCharSet characterIsMember:ch]) {
					self.state = IN_HEX_VALUE;
				} else {
					// "&#x" not followed by a hex digit: emit literally and reprocess ch.
					BACKTRACK;
					EMIT_CONSUMED_CHARS;
				}
				break;

			case IN_HEX_VALUE:
				if ([hexDigitCharSet characterIsMember:ch]) {
					// still inside the hex digits
				} else {
					if (ch != ';') {
						BACKTRACK;
					}
					[res appendString:[self expandHexEntity:self.charsConsumed]];
					CLEAR_CONSUMED_CHARS;
					self.state = OUTSIDE_ENTITY;
				}
				break;

			case IN_DECIMAL_VALUE:
				if ([decimalDigitCharSet characterIsMember:ch]) {
					// still inside the decimal digits
				} else {
					if (ch != ';') {
						BACKTRACK;
					}
					[res appendString:[self expandDecimalEntity:self.charsConsumed]];
					CLEAR_CONSUMED_CHARS;
					self.state = OUTSIDE_ENTITY;
				}
				break;

			default:
				// self.state is an NSInteger-backed enum, hence %ld with a long cast.
				XT_ERROR_1(@"unexpected state %ld", (long)self.state);
				EMIT_CONSUMED_CHARS;
				break;
		}
	}

	return res;
}

//TODO unit test
/**
 * Forces out whatever input is still pending, then resets the parser.
 *
 * A pending named / hex / decimal entity is expanded as if it had ended here;
 * any other pending input is returned as-is.
 *
 * @return The expansion of the pending input (empty if nothing was pending).
 */
- (NSString *)hardFlush
{
	NSString *expanded;

	switch (self.state) {
		case IN_NAMED_VALUE:
			expanded = [self expandNamedEntity:self.charsConsumed];
			break;
		case IN_HEX_VALUE:
			expanded = [self expandHexEntity:self.charsConsumed];
			break;
		case IN_DECIMAL_VALUE:
			expanded = [self expandDecimalEntity:self.charsConsumed];
			break;
		default:
			expanded = self.charsConsumed;
			break;
	}

	// The expand methods may hand back their argument unchanged, i.e. the
	// mutable charsConsumed buffer itself. Take an immutable snapshot before the
	// buffer is cleared below, or the returned string would be emptied too.
	NSString *res = [expanded copy];

	CLEAR_CONSUMED_CHARS;
	self.state = OUTSIDE_ENTITY;

	return res;
}

/**
 * Discards any pending input and returns the parser to its initial state.
 * Unlike -hardFlush, nothing is emitted.
 */
- (void)reset
{
	self.state = OUTSIDE_ENTITY;
	[self.charsConsumed setString:@""];
}

/**
 * Expands a named entity.
 *
 * @param entity The raw entity text, of the form "&name" or "&name;".
 * @return - the mapped character(s) if `name` is a known entity name;
 *         - otherwise, if a prefix of `name` (at least 2 characters long, longest
 *           first) is a known entity name: that prefix's expansion followed by
 *           the rest of the name and the ';' if there was one;
 *         - otherwise the entity text unchanged (and a warning is logged).
 */
- (NSString *)expandNamedEntity:(NSString *)entity
{
	XT_DEF_SELNAME;

	const NSUInteger nameIndex = 1; // past &
	BOOL endedWithSemicolon = [XTStringUtils string:entity endsWithChar:';'];
	NSString *maybeSemicolon = (endedWithSemicolon ? @";" : @"");

	// Guard the unsigned length arithmetic below against degenerate input
	// (e.g. an empty string or a lone ";"), which would otherwise wrap around
	// and produce an out-of-bounds range.
	if (entity.length < nameIndex + maybeSemicolon.length) {
		return (entity != nil ? [entity copy] : @"");
	}

	NSUInteger nameLen = entity.length - nameIndex - maybeSemicolon.length;
	NSString *name = [entity substringWithRange:NSMakeRange(nameIndex, nameLen)];

	NSString *res = charByNamedEntity[name];

	if (res == nil && nameLen > 2) {
		// No exact match, so try progressively shorter prefixes of the name.
		// (nameLen > 2 also keeps prefixLen from wrapping around below zero.)
		for (NSUInteger prefixLen = nameLen - 1; prefixLen >= 2; prefixLen--) {
			NSString *namePrefix = [name substringToIndex:prefixLen];
			NSString *expanded = charByNamedEntity[namePrefix];
			if (expanded != nil) {
				NSString *remainder = [name substringFromIndex:prefixLen];
				res = [NSString stringWithFormat:@"%@%@%@", expanded, remainder, maybeSemicolon];
				break;
			}
		}
	}

	if (res == nil) {
		// Unknown entity: pass it through literally.
		res = [NSString stringWithFormat:@"&%@%@", name, maybeSemicolon];
		XT_WARN_1(@"unknown entity name \"%@\"", name);
	}
	return res;
}

/**
 * Expands a hexadecimal numeric entity.
 *
 * @param entity The raw entity text, of the form "&#xHHHH" or "&#xHHHH;".
 * @return The character with that code point, or (a copy of) the entity text
 *         unchanged if the digits can't be parsed or the value is not a valid
 *         Unicode code point (> U+10FFFF).
 */
- (NSString *)expandHexEntity:(NSString *)entity
{
	NSString *res = [entity copy];
	NSString *digits = [self digitsOfNumericEntity:entity prefixLength:3]; // past &#x

	if (digits.length > 0) {
		NSScanner *scanner = [NSScanner scannerWithString:digits];
		unsigned int value = 0;
		if ([scanner scanHexInt:&value] && [scanner isAtEnd]) {
			NSString *expanded = [self stringWithCodePoint:value];
			if (expanded != nil) {
				res = expanded;
			}
		}
	}
	return res;
}

/**
 * Expands a decimal numeric entity.
 *
 * @param entity The raw entity text, of the form "&#DDDD" or "&#DDDD;".
 * @return The character with that code point, or (a copy of) the entity text
 *         unchanged if the digits can't be parsed or the value is not a valid
 *         Unicode code point (> U+10FFFF).
 */
- (NSString *)expandDecimalEntity:(NSString *)entity
{
	NSString *res = [entity copy];
	NSString *digits = [self digitsOfNumericEntity:entity prefixLength:2]; // past &#

	if (digits.length > 0) {
		NSScanner *scanner = [NSScanner scannerWithString:digits];
		NSInteger value = 0;
		if ([scanner scanInteger:&value] && [scanner isAtEnd] && value >= 0) {
			NSString *expanded = [self stringWithCodePoint:(NSUInteger)value];
			if (expanded != nil) {
				res = expanded;
			}
		}
	}
	return res;
}

// Returns the part of a numeric entity between its prefix ("&#" / "&#x") and
// its optional trailing ';', or nil if the entity is too short to have one.
- (NSString *)digitsOfNumericEntity:(NSString *)entity prefixLength:(NSUInteger)prefixLength
{
	NSUInteger suffixLength = ([XTStringUtils string:entity endsWithChar:';'] ? 1 : 0);
	if (entity.length < prefixLength + suffixLength) {
		// would wrap around in the unsigned subtraction below
		return nil;
	}
	NSUInteger digitsLength = entity.length - prefixLength - suffixLength;
	return [entity substringWithRange:NSMakeRange(prefixLength, digitsLength)];
}

// Returns a string holding the given Unicode code point, or nil if the value
// is beyond U+10FFFF. NSString is UTF-16 based, so code points above U+FFFF
// don't fit in a single unichar and are encoded as a surrogate pair instead of
// being truncated.
- (NSString *)stringWithCodePoint:(NSUInteger)codePoint
{
	if (codePoint <= 0xFFFF) {
		return [self stringWithChar:(unichar)codePoint];
	}
	if (codePoint > 0x10FFFF) {
		return nil;
	}
	NSUInteger offset = codePoint - 0x10000;
	unichar surrogates[2];
	surrogates[0] = (unichar)(0xD800 + (offset >> 10));   // high (lead) surrogate
	surrogates[1] = (unichar)(0xDC00 + (offset & 0x3FF)); // low (trail) surrogate
	return [NSString stringWithCharacters:surrogates length:2];
}

/**
 * Returns a one-character string holding the given UTF-16 code unit.
 */
- (NSString *)stringWithChar:(unichar)ch
{
	NSString *res = [NSString stringWithCharacters:&ch length:1];
	return res;
}

/**
 * Builds the table that maps an entity name (without the leading '&' and the
 * trailing ';') to the string it expands to, and stores it in
 * charByNamedEntity.
 */
+ (void)initCharByNamedEntity
{
	/* See https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
	 
		tcols '-i;' '-o ' from html-entities-name-val.csv '$1..2' | tcols '-i ' '/@"/#$1#/" : @"\u/#$2#/",/' >raw-char-entitiy-map.txt
	 */
	
	// The literal is an immutable NSDictionary, so it is assigned straight to the
	// NSDictionary-typed static. Every key must appear only once in the literal.
	charByNamedEntity = @{
	  // "late discoveries" supported by TADS VMs:
	  @"fpmsp" : @"\u2005",
	  @"figsp": @"\u2007",
	  @"hairsp": @"\u200a",
	  @"puncsp": @"\u2008",
	  @"spmsp": @"\u2006",
	  @"tpmsp": @"\u2004",
	  @"zwnbsp": @"\ufeff",
	  @"zwsp": @"\u200b",
	  // regular:
	  @"quot" : @"\"",
	  @"amp" : @"&",
	  @"apos" : @"'",
	  @"lt" : @"<",
	  @"gt" : @">",
	  @"nbsp" : @"\u00A0",
	  @"iexcl" : @"\u00A1",
	  @"cent" : @"\u00A2",
	  @"pound" : @"\u00A3",
	  @"curren" : @"\u00A4",
	  @"yen" : @"\u00A5",
	  @"brvbar" : @"\u00A6",
	  @"sect" : @"\u00A7",
	  @"uml" : @"\u00A8",
	  @"copy" : @"\u00A9",
	  @"ordf" : @"\u00AA",
	  @"laquo" : @"\u00AB",
	  @"not" : @"\u00AC",
	  @"shy" : @"\u00AD",
	  @"reg" : @"\u00AE",
	  @"macr" : @"\u00AF",
	  @"deg" : @"\u00B0",
	  @"plusmn" : @"\u00B1",
	  @"sup2" : @"\u00B2",
	  @"sup3" : @"\u00B3",
	  @"acute" : @"\u00B4",
	  @"micro" : @"\u00B5",
	  @"para" : @"\u00B6",
	  @"middot" : @"\u00B7",
	  @"cedil" : @"\u00B8",
	  @"sup1" : @"\u00B9",
	  @"ordm" : @"\u00BA",
	  @"raquo" : @"\u00BB",
	  @"frac14" : @"\u00BC",
	  @"frac12" : @"\u00BD",
	  @"frac34" : @"\u00BE",
	  @"iquest" : @"\u00BF",
	  @"Agrave" : @"\u00C0",
	  @"Aacute" : @"\u00C1",
	  @"Acirc" : @"\u00C2",
	  @"Atilde" : @"\u00C3",
	  @"Auml" : @"\u00C4",
	  @"Aring" : @"\u00C5",
	  @"AElig" : @"\u00C6",
	  @"Ccedil" : @"\u00C7",
	  @"Egrave" : @"\u00C8",
	  @"Eacute" : @"\u00C9",
	  @"Ecirc" : @"\u00CA",
	  @"Euml" : @"\u00CB",
	  @"Igrave" : @"\u00CC",
	  @"Iacute" : @"\u00CD",
	  @"Icirc" : @"\u00CE",
	  @"Iuml" : @"\u00CF",
	  @"ETH" : @"\u00D0",
	  @"Ntilde" : @"\u00D1",
	  @"Ograve" : @"\u00D2",
	  @"Oacute" : @"\u00D3",
	  @"Ocirc" : @"\u00D4",
	  @"Otilde" : @"\u00D5",
	  @"Ouml" : @"\u00D6",
	  @"times" : @"\u00D7",
	  @"Oslash" : @"\u00D8",
	  @"Ugrave" : @"\u00D9",
	  @"Uacute" : @"\u00DA",
	  @"Ucirc" : @"\u00DB",
	  @"Uuml" : @"\u00DC",
	  @"Yacute" : @"\u00DD",
	  @"THORN" : @"\u00DE",
	  @"szlig" : @"\u00DF",
	  @"agrave" : @"\u00E0",
	  @"aacute" : @"\u00E1",
	  @"acirc" : @"\u00E2",
	  @"atilde" : @"\u00E3",
	  @"auml" : @"\u00E4",
	  @"aring" : @"\u00E5",
	  @"aelig" : @"\u00E6",
	  @"ccedil" : @"\u00E7",
	  @"egrave" : @"\u00E8",
	  @"eacute" : @"\u00E9",
	  @"ecirc" : @"\u00EA",
	  @"euml" : @"\u00EB",
	  @"igrave" : @"\u00EC",
	  @"iacute" : @"\u00ED",
	  @"icirc" : @"\u00EE",
	  @"iuml" : @"\u00EF",
	  @"eth" : @"\u00F0",
	  @"ntilde" : @"\u00F1",
	  @"ograve" : @"\u00F2",
	  @"oacute" : @"\u00F3",
	  @"ocirc" : @"\u00F4",
	  @"otilde" : @"\u00F5",
	  @"ouml" : @"\u00F6",
	  @"divide" : @"\u00F7",
	  @"oslash" : @"\u00F8",
	  @"ugrave" : @"\u00F9",
	  @"uacute" : @"\u00FA",
	  @"ucirc" : @"\u00FB",
	  @"uuml" : @"\u00FC",
	  @"yacute" : @"\u00FD",
	  @"thorn" : @"\u00FE",
	  @"yuml" : @"\u00FF",
	  @"OElig" : @"\u0152",
	  @"oelig" : @"\u0153",
	  @"Scaron" : @"\u0160",
	  @"scaron" : @"\u0161",
	  @"Yuml" : @"\u0178",
	  @"fnof" : @"\u0192",
	  @"circ" : @"\u02C6",
	  @"tilde" : @"\u02DC",
	  @"Alpha" : @"\u0391",
	  @"Beta" : @"\u0392",
	  @"Gamma" : @"\u0393",
	  @"Delta" : @"\u0394",
	  @"Epsilon" : @"\u0395",
	  @"Zeta" : @"\u0396",
	  @"Eta" : @"\u0397",
	  @"Theta" : @"\u0398",
	  @"Iota" : @"\u0399",
	  @"Kappa" : @"\u039A",
	  @"Lambda" : @"\u039B",
	  @"Mu" : @"\u039C",
	  @"Nu" : @"\u039D",
	  @"Xi" : @"\u039E",
	  @"Omicron" : @"\u039F",
	  @"Pi" : @"\u03A0",
	  @"Rho" : @"\u03A1",
	  @"Sigma" : @"\u03A3",
	  @"Tau" : @"\u03A4",
	  @"Upsilon" : @"\u03A5",
	  @"Phi" : @"\u03A6",
	  @"Chi" : @"\u03A7",
	  @"Psi" : @"\u03A8",
	  @"Omega" : @"\u03A9",
	  @"alpha" : @"\u03B1",
	  @"beta" : @"\u03B2",
	  @"gamma" : @"\u03B3",
	  @"delta" : @"\u03B4",
	  @"epsilon" : @"\u03B5",
	  @"zeta" : @"\u03B6",
	  @"eta" : @"\u03B7",
	  @"theta" : @"\u03B8",
	  @"iota" : @"\u03B9",
	  @"kappa" : @"\u03BA",
	  @"lambda" : @"\u03BB",
	  @"mu" : @"\u03BC",
	  @"nu" : @"\u03BD",
	  @"xi" : @"\u03BE",
	  @"omicron" : @"\u03BF",
	  @"pi" : @"\u03C0",
	  @"rho" : @"\u03C1",
	  @"sigmaf" : @"\u03C2",
	  @"sigma" : @"\u03C3",
	  @"tau" : @"\u03C4",
	  @"upsilon" : @"\u03C5",
	  @"phi" : @"\u03C6",
	  @"chi" : @"\u03C7",
	  @"psi" : @"\u03C8",
	  @"omega" : @"\u03C9",
	  @"thetasym" : @"\u03D1",
	  @"upsih" : @"\u03D2",
	  @"piv" : @"\u03D6",
	  @"ensp" : @"\u2002",
	  @"emsp" : @"\u2003",
	  @"thinsp" : @"\u2009",
	  @"zwnj" : @"\u200C",
	  @"zwj" : @"\u200D",
	  @"lrm" : @"\u200E",
	  @"rlm" : @"\u200F",
	  @"ndash" : @"\u2013",
	  @"mdash" : @"\u2014",
	  @"lsquo" : @"\u2018",
	  @"rsquo" : @"\u2019",
	  @"sbquo" : @"\u201A",
	  @"ldquo" : @"\u201C",
	  @"rdquo" : @"\u201D",
	  @"bdquo" : @"\u201E",
	  @"dagger" : @"\u2020",
	  @"Dagger" : @"\u2021",
	  @"bull" : @"\u2022",
	  @"hellip" : @"\u2026",
	  @"permil" : @"\u2030",
	  @"prime" : @"\u2032",
	  @"Prime" : @"\u2033",
	  @"lsaquo" : @"\u2039",
	  @"rsaquo" : @"\u203A",
	  @"oline" : @"\u203E",
	  @"frasl" : @"\u2044",
	  @"euro" : @"\u20AC",
	  @"image" : @"\u2111",
	  @"weierp" : @"\u2118",
	  @"real" : @"\u211C",
	  @"trade" : @"\u2122",
	  @"alefsym" : @"\u2135",
	  @"larr" : @"\u2190",
	  @"uarr" : @"\u2191",
	  @"rarr" : @"\u2192",
	  @"darr" : @"\u2193",
	  @"harr" : @"\u2194",
	  @"crarr" : @"\u21B5",
	  @"lArr" : @"\u21D0",
	  @"uArr" : @"\u21D1",
	  @"rArr" : @"\u21D2",
	  @"dArr" : @"\u21D3",
	  @"hArr" : @"\u21D4",
	  @"forall" : @"\u2200",
	  @"part" : @"\u2202",
	  @"exist" : @"\u2203",
	  @"empty" : @"\u2205",
	  @"nabla" : @"\u2207",
	  @"isin" : @"\u2208",
	  @"notin" : @"\u2209",
	  @"ni" : @"\u220B",
	  @"prod" : @"\u220F",
	  @"sum" : @"\u2211",
	  @"minus" : @"\u2212",
	  @"lowast" : @"\u2217",
	  @"radic" : @"\u221A",
	  @"prop" : @"\u221D",
	  @"infin" : @"\u221E",
	  @"ang" : @"\u2220",
	  @"and" : @"\u2227",
	  @"or" : @"\u2228",
	  @"cap" : @"\u2229",
	  @"cup" : @"\u222A",
	  @"int" : @"\u222B",
	  @"there4" : @"\u2234",
	  @"sim" : @"\u223C",
	  @"cong" : @"\u2245",
	  @"asymp" : @"\u2248",
	  @"ne" : @"\u2260",
	  @"equiv" : @"\u2261",
	  @"le" : @"\u2264",
	  @"ge" : @"\u2265",
	  @"sub" : @"\u2282",
	  @"sup" : @"\u2283",
	  @"nsub" : @"\u2284",
	  @"sube" : @"\u2286",
	  @"supe" : @"\u2287",
	  @"oplus" : @"\u2295",
	  @"otimes" : @"\u2297",
	  @"perp" : @"\u22A5",
	  @"sdot" : @"\u22C5",
	  @"vellip" : @"\u22EE",
	  @"lceil" : @"\u2308",
	  @"rceil" : @"\u2309",
	  @"lfloor" : @"\u230A",
	  @"rfloor" : @"\u230B",
	  @"lang" : @"\u2329",
	  @"rang" : @"\u232A",
	  @"loz" : @"\u25CA",
	  @"spades" : @"\u2660",
	  @"clubs" : @"\u2663",
	  @"hearts" : @"\u2665",
	  @"diams" : @"\u2666",
	  
	  // non-std / legacy:
	  @"emdash" : @"\u2014",
	  @"endash" : @"\u2013",
	  @"lsq" : @"\u2018",
	  @"rsq" : @"\u2019",
	  @"ldq" : @"\u201C",
	  @"rdq" : @"\u201D",

	  //From http://www.tads.org/t3doc/doc/htmltads/latin2.htm :
	  // ("Scaron" and "scaron" from that list are already defined above, with
	  // the same values, so they are not repeated here.)
	  @"Aogon" : @"\u0104",
	  @"breve" : @"\u02d8",
	  @"Lstrok" : @"\u0141",
	  @"Lcaron" : @"\u013d",
	  @"Sacute" : @"\u015a",
	  @"Scedil" : @"\u015e",
	  @"Tcaron" : @"\u0164",
	  @"Zacute" : @"\u0179",
	  @"Zcaron" : @"\u017d",
	  @"Zdot" : @"\u017b",
	  @"aogon" : @"\u0105",
	  @"ogon" : @"\u02db",
	  @"lstrok" : @"\u0142",
	  @"lcaron" : @"\u013e",
	  @"sacute" : @"\u015b",
	  @"caron" : @"\u02c7",
	  @"scedil" : @"\u015f",
	  @"tcaron" : @"\u0165",
	  @"zacute" : @"\u017a",
	  @"dblac" : @"\u02dd",
	  @"zcaron" : @"\u017e",
	  @"zdot" : @"\u017c",
	  @"Racute" : @"\u0154",
	  @"Abreve" : @"\u0102",
	  @"Lacute" : @"\u0139",
	  @"Cacute" : @"\u0106",
	  @"Ccaron" : @"\u010c",
	  @"Eogon" : @"\u0118",
	  @"Ecaron" : @"\u011a",
	  @"Dcaron" : @"\u010e",
	  @"Dstrok" : @"\u0110",
	  @"Nacute" : @"\u0143",
	  @"Ncaron" : @"\u0147",
	  @"Odblac" : @"\u0150",
	  @"Rcaron" : @"\u0158",
	  @"Uring" : @"\u016e",
	  @"Udblac" : @"\u0170",
	  @"Tcedil" : @"\u0162",
	  @"racute" : @"\u0155",
	  @"abreve" : @"\u0103",
	  @"lacute" : @"\u013a",
	  @"cacute" : @"\u0107",
	  @"ccaron" : @"\u010d",
	  @"eogon" : @"\u0119",
	  @"ecaron" : @"\u011b",
	  @"dcaron" : @"\u010f",
	  @"dstrok" : @"\u0111",
	  @"nacute" : @"\u0144",
	  @"ncaron" : @"\u0148",
	  @"odblac" : @"\u0151",
	  @"rcaron" : @"\u0159",
	  @"uring" : @"\u016f",
	  @"udblac" : @"\u0171",
	  @"tcedil" : @"\u0163",
	  @"dot" : @"\u02d9"
	};
}

@end
