#include "FGen.h"
#include <iterator>
#include <sstream>
using namespace std;

#include "StringMap.h"
#include "Options.h"
#include "CharUtils.h"
#include "CppAssert.h"
using namespace egstra;

#define CONF_DEFAULT "p|pu|pb|pch"

namespace dparser {
	void FGen::process_options()
	{
		int tmp;
        string strtmp;

        _chartype_file = "chartype.txt";
		if(options::get("chartype-file", strtmp)) {
			_chartype_file = strtmp;
		}

        load_chartype(); 

		_fcutoff = 1;
		if(options::get("fcutoff", tmp)) {
			_fcutoff = tmp;
		}

		_use_guide_postag = false;
		_use_guide_postag_prob = false;
		if(options::get("use-guide-postag", tmp)) {
			_use_guide_postag = (1 == tmp);
		}
		if (_use_guide_postag) {
			if(options::get("use-guide-postag-prob", tmp)) {
				_use_guide_postag_prob = (1 == tmp);
			}
		}

		_use_lexicon_feature = false;
		/* Add lexicon feature, created by chaojy*/
		if(options::get("use-lexicon-feature", tmp)){
			_use_lexicon_feature = tmp;
		}

	}

	/**
	 * Append the unigram POS feature strings for position node_id to feats_str.
	 *
	 * Feature groups, in emission order:
	 *   - forms in a +/-2 window around node_id (singles, pairs, triples);
	 *   - getCharType() classes in a +/-1 window (singles, pairs, triple);
	 *   - equality indicators between the current form and its neighbours;
	 *   - lexicon features with prefix "PL-" (only if _use_lexicon_feature).
	 * Neighbours outside positions 1 .. inst->size()-1 are replaced by NO_FORM.
	 * Positions 0 and inst->size() produce no features at all.
	 *
	 * @param inst          instance supplying forms, guide_postags and the
	 *                      fbegin_vec / fmiddle_vec / fend_vec lexicon vectors
	 * @param node_id       position; asserted to lie in [0, inst->size()]
	 * @param feats_str     receives the features; existing content is kept
	 * @param use_guide_pos if true, the guide POS tag of node_id becomes part of
	 *                      the "PU-" feature prefix
	 */
	void FGen::addPOSFeature_unigram( const Instance *inst, const int node_id, list<string> &feats_str, bool use_guide_pos /*= false*/ ) const
	{
		assert(node_id >= 0 && node_id <= inst->size());
		if (0 == node_id || inst->size() == node_id) return;

		const string info = use_guide_pos ? inst->guide_postags[node_id] : "";
		const string prefix = "PU-" + info + FEAT_SEP;

		// Which immediate neighbours exist; position 0 never counts as a neighbour.
		const bool has_L1 = node_id > 1;
		const bool has_R1 = node_id < inst->size() - 1;

		const string &c0   = inst->forms[node_id];
		const string &c_L1 = has_L1 ? inst->forms[node_id-1] : NO_FORM;
		const string &c_R1 = has_R1 ? inst->forms[node_id+1] : NO_FORM;
		const string &c_L2 = node_id > 2 ? inst->forms[node_id-2] : NO_FORM;
		const string &c_R2 = node_id < inst->size() - 2 ? inst->forms[node_id+2] : NO_FORM;

		const string t0   = getCharType(c0);
		const string t_L1 = (c_L1 == NO_FORM ? NO_FORM : getCharType(c_L1));
		const string t_R1 = (c_R1 == NO_FORM ? NO_FORM : getCharType(c_R1));

		// Single forms.
		feats_str.push_back(prefix + "c0=" + c0);
		feats_str.push_back(prefix + "c_L1=" + c_L1);
		feats_str.push_back(prefix + "c_L2=" + c_L2);
		feats_str.push_back(prefix + "c_R1=" + c_R1);
		feats_str.push_back(prefix + "c_R2=" + c_R2);

		// Adjacent form pairs.
		feats_str.push_back(prefix + "c_L21=" + c_L2 + FEAT_SEP + c_L1);
		feats_str.push_back(prefix + "c_L10=" + c_L1 + FEAT_SEP + c0);
		feats_str.push_back(prefix + "c_R01=" + c0   + FEAT_SEP + c_R1);
		feats_str.push_back(prefix + "c_R12=" + c_R1 + FEAT_SEP + c_R2);

		// Form pairs that skip one position.
		feats_str.push_back(prefix + "c_L20="  + c_L2 + FEAT_SEP + c0);
		feats_str.push_back(prefix + "c_L1R1=" + c_L1 + FEAT_SEP + c_R1);
		feats_str.push_back(prefix + "c_R02="  + c0   + FEAT_SEP + c_R2);

		// Form triples.
		feats_str.push_back(prefix + "c_L210="  + c_L2 + FEAT_SEP + c_L1 + FEAT_SEP + c0);
		feats_str.push_back(prefix + "c_L10R1=" + c_L1 + FEAT_SEP + c0   + FEAT_SEP + c_R1);
		feats_str.push_back(prefix + "c_R012="  + c0   + FEAT_SEP + c_R1 + FEAT_SEP + c_R2);

		// Character-type features.
		feats_str.push_back(prefix + "t0=" + t0);
		feats_str.push_back(prefix + "t_L1=" + t_L1);
		feats_str.push_back(prefix + "t_R1=" + t_R1);

		feats_str.push_back(prefix + "t_L10=" + t_L1 + FEAT_SEP + t0);
		feats_str.push_back(prefix + "t_R01=" + t0   + FEAT_SEP + t_R1);

		feats_str.push_back(prefix + "t_L10R1=" + t_L1 + FEAT_SEP + t0 + FEAT_SEP + t_R1);

		// Equality indicators (emitted only when the two forms are equal).
		if (c_L2 == c0)   feats_str.push_back(prefix + "c_L2==c0");
		if (c_L1 == c0)   feats_str.push_back(prefix + "c_L1==c0");
		if (c_R1 == c0)   feats_str.push_back(prefix + "c_R1==c0");
		if (c_R2 == c0)   feats_str.push_back(prefix + "c_R2==c0");
		if (c_L1 == c_R1) feats_str.push_back(prefix + "c_L1==c_R1");

		if (_use_lexicon_feature) {
			// Explicit std::string so the concatenation does not depend on FEAT_SEP's type.
			const string pprefix = string("PL-") + FEAT_SEP;
			// The value "0" suppresses a lexicon feature; it also stands in for
			// a missing neighbour.
			const string no_lex = "0";

			const string &FB_c0 = inst->fbegin_vec[node_id];
			const string &FM_c0 = inst->fmiddle_vec[node_id];
			const string &FE_c0 = inst->fend_vec[node_id];

			const string &FB_L1 = has_L1 ? inst->fbegin_vec[node_id-1]  : no_lex;
			const string &FM_L1 = has_L1 ? inst->fmiddle_vec[node_id-1] : no_lex;
			const string &FE_L1 = has_L1 ? inst->fend_vec[node_id-1]    : no_lex;

			// FIX: FM_R1 / FE_R1 previously read fbegin_vec (copy-paste slip), so
			// the right-neighbour middle/end features duplicated the begin value.
			const string &FB_R1 = has_R1 ? inst->fbegin_vec[node_id+1]  : no_lex;
			const string &FM_R1 = has_R1 ? inst->fmiddle_vec[node_id+1] : no_lex;
			const string &FE_R1 = has_R1 ? inst->fend_vec[node_id+1]    : no_lex;

			if (FB_c0 != no_lex) feats_str.push_back(pprefix + "FB_c0=" + FB_c0);
			if (FM_c0 != no_lex) feats_str.push_back(pprefix + "FM_c0=" + FM_c0);
			if (FE_c0 != no_lex) feats_str.push_back(pprefix + "FE_c0=" + FE_c0);

			if (FB_L1 != no_lex) feats_str.push_back(pprefix + "FB_L1=" + FB_L1);
			if (FM_L1 != no_lex) feats_str.push_back(pprefix + "FM_L1=" + FM_L1);
			if (FE_L1 != no_lex) feats_str.push_back(pprefix + "FE_L1=" + FE_L1);

			if (FB_R1 != no_lex) feats_str.push_back(pprefix + "FB_R1=" + FB_R1);
			if (FM_R1 != no_lex) feats_str.push_back(pprefix + "FM_R1=" + FM_R1);
			if (FE_R1 != no_lex) feats_str.push_back(pprefix + "FE_R1=" + FE_R1);

			// NOTE: combined left/current/right lexicon features (FB / FM / FE
			// triples) used to sit here as commented-out code and remain disabled.
		}
	}


	void FGen::addPOSFeature_unigram_guide( const Instance *inst, const int node_id, list<string> &feats_str, list<double> &probs ) const
	{
		assert(_use_guide_postag);
		assert(node_id >= 0 && node_id <= inst->size());
		if (0 == node_id || inst->size() == node_id) return;

		assert(probs.empty());
		assert(feats_str.empty());

		double prob = 1.0, prob_L1 = 1.0, prob_R1 = 1.0;
		if (_use_guide_postag_prob) {
			prob = inst->guide_probs[node_id];
			if (node_id > 1) prob_L1 = inst->guide_probs[node_id-1];
			if (node_id < inst->size() - 1) prob_R1 = inst->guide_probs[node_id+1];
		}
		
		addPOSFeature_unigram(inst, node_id, feats_str, true);
		probs.clear();
		probs.resize(feats_str.size(), prob);

		const string prefix = "PU-GP-";
		string feat;
		// guide tags
		const string &tg_L1 = node_id > 1 ? inst->guide_postags[node_id-1] : NO_CPOSTAG;
		const string &tg = inst->guide_postags[node_id];
		const string &tg_R1 = node_id < inst->size() - 1 ? inst->guide_postags[node_id+1] : NO_CPOSTAG;

		feat = prefix + "0=" + tg;												feats_str.push_back(feat); probs.push_back(prob);
		feat = prefix + "1=" + tg_L1;											feats_str.push_back(feat); probs.push_back(prob_L1);
		feat = prefix + "2=" + tg_R1;											feats_str.push_back(feat); probs.push_back(prob_R1);
		feat = prefix + "3=" + tg_L1 + FEAT_SEP + tg;							feats_str.push_back(feat); probs.push_back(min(prob, prob_L1));
		feat = prefix + "4=" + tg + FEAT_SEP + tg_R1;							feats_str.push_back(feat); probs.push_back(min(prob, prob_R1));
		feat = prefix + "5=" + tg_L1 + FEAT_SEP + tg_R1;						feats_str.push_back(feat); probs.push_back(min(prob_R1, prob_L1));
		feat = prefix + "6=" + tg_L1 + FEAT_SEP + tg + FEAT_SEP + tg_R1;		feats_str.push_back(feat); probs.push_back(min(prob,min(prob_R1, prob_L1)));
	}
	

	void FGen::addPOSFeature_bigram( const Instance *inst, const int node_id, const string &cpostag_L1, list<string> &feats_str ) const
	{
        assert(node_id >= 0 && node_id <= inst->size());
        string prefix = "PB-";
		string feat;

        const string &c0 = (node_id == inst->size() ? NO_FORM : inst->forms[node_id]);
        const string &c_L1 = node_id > 1 ? inst->forms[node_id-1] : NO_FORM;

        feat = prefix + "t_L1=" + cpostag_L1; feats_str.push_back(feat);
        feat = prefix + "t_L1+c0=" + cpostag_L1 + FEAT_SEP + c0; feats_str.push_back(feat);
        feat = prefix + "t_L1+c0+c_L1=" + cpostag_L1 + FEAT_SEP + c0 + FEAT_SEP + c_L1; feats_str.push_back(feat);
	}


	/**
	 * Print the option summary of this feature generator to stderr, followed by
	 * a caller-supplied message.
	 *
	 * The list includes the options read by process_options() (chartype-file,
	 * fcutoff, use-guide-postag, use-guide-postag-prob, use-lexicon-feature)
	 * with the defaults set there.
	 *
	 * @param mesg trailing message; a null pointer is printed as an empty line
	 */
	void FGen::usage(const char* const mesg) const {
		cerr << _name << " options:" << endl;
		cerr << " --fdictdir=<str> : pathname to feature-dictionary directory" << endl;
		cerr << " --fcutoff=<int>  : minimum feature count (default 1)" << endl;
		cerr << " --chartype-file=<str> : pathname to character-type file (default \"chartype.txt\")" << endl;
		cerr << " --use-guide-postag=<0|1> : add guide POS-tag features (default 0)" << endl;
		cerr << " --use-guide-postag-prob=<0|1> : attach guide-tag probabilities to the guide" << endl;
		cerr << "     features; only read when --use-guide-postag=1 (default 0)" << endl;
		cerr << " --use-lexicon-feature=<0|1> : add lexicon features (default 0)" << endl;
		cerr << " --fconf=\"<flag>|...\" : feature-configuration flags" << endl;
		cerr << "     p    : pos features" << endl;
		cerr << "     pu   : pos unigram features" << endl;
		cerr << "     pb   : pos bigram features" << endl;
		// NOTE(review): CONF_DEFAULT spells this flag "pch", the help says "pc";
		// the flag parser is not visible here, so the text is left unchanged.
		cerr << "     pc   : pos char-based features" << endl;

		cerr << "   (default \"" << CONF_DEFAULT << "\")" << endl;
		cerr << endl;
		// Streaming a null const char* is undefined behaviour; print "" instead.
		cerr << (mesg ? mesg : "") << endl;
	}

	/**
	 * Release the per-instance feature vectors and score tables of inst.
	 *
	 * For every position 0 .. inst->size() only the tag-0 unigram vector and,
	 * per previous tag, the tag-0 bigram vector are deallocated individually;
	 * create_all_feature_vectors() makes the remaining slots share the storage
	 * of those. Afterwards the four containers themselves are deallocated.
	 *
	 * @param inst instance whose fvec_unigram / prob_unigram / fvec_bigram /
	 *             prob_bigram are released
	 */
	void FGen::dealloc_fvec_prob( Instance * const inst ) const
	{
		const int last = inst->size();
		const int ntag = tag_number();

		// Unigram part: one storage-owning vector per position.
		for (int pos = 0; pos <= last; ++pos)
			inst->fvec_unigram[pos][0].dealloc();
		inst->fvec_unigram.dealloc();
		inst->prob_unigram.dealloc();

		// Bigram part: one storage-owning vector per (position, previous tag).
		for (int pos = 0; pos <= last; ++pos) {
			for (int prev = 0; prev < ntag; ++prev)
				inst->fvec_bigram[pos][prev][0].dealloc();
		}
		inst->fvec_bigram.dealloc();
		inst->prob_bigram.dealloc();
	}

	void FGen::create_all_feature_vectors( Instance * const inst )
	{
		const int len = inst->size();
		const int ntag = tag_number();
		list<string> feats_str;
		list<double> probs;
		inst->fvec_unigram.resize(len+1, ntag);
		inst->prob_unigram.resize(len+1, ntag);
		inst->fvec_unigram = fvec();
		inst->prob_unigram = DOUBLE_NEGATIVE_INFINITY;

		inst->fvec_bigram.resize(len+1, ntag, ntag);
		inst->prob_bigram.resize(len+1, ntag, ntag);
		inst->fvec_bigram = fvec();
		inst->prob_bigram = DOUBLE_NEGATIVE_INFINITY;


		for (int i = 0; i <= len; ++i) {
			for (int t = 0; t < ntag; ++t) {
				{
					const int offset = _pos_feat_offset + _pos_feat_dim * t;
					fvec * const fv = &inst->fvec_unigram[i][t];
					if (t == 0) {
						feats_str.clear();
						double prob = 1.;
						addPOSFeature_unigram(inst, i, feats_str, false);
						_pos_feat_dict.map_all(fv, offset, feats_str, _generation_mode);
						if (_use_guide_postag) {
							feats_str.clear();
							probs.clear();
							addPOSFeature_unigram_guide(inst, i, feats_str, probs);
							fvec fv2;
							_pos_feat_dict.map_all(&fv2, offset, feats_str, probs, _generation_mode);
							fv->append(fv2);
							fv2.dealloc();
						}
					} else {
						const fvec * const fv0 = &inst->fvec_unigram[i][0];
						fv->idx = fv0->idx;
						fv->n = fv0->n;
						fv->val = fv0->val;
						fv->offset = offset;
					}
				}
				for (int t0 = 0; t0 < ntag; ++t0) {
					const int offset = _pos_feat_offset + _pos_feat_dim * t0;
					fvec * const fv = &inst->fvec_bigram[i][t][t0]; // t: previous tag id
					if (t0 == 0) {
						feats_str.clear();
						addPOSFeature_bigram(inst, i, pos_id_2_str(t), feats_str);
						_pos_feat_dict.map_all(fv, offset, feats_str, _generation_mode);
					} else {
						const fvec * const fv0 = &inst->fvec_bigram[i][t][0];
						fv->idx = fv0->idx;
						fv->n = fv0->n;
						fv->val = fv0->val;
						fv->offset = offset;
					}
				}
			}
		}
	}

	void FGen::create_all_pos_features_according_to_tree(const Instance * const inst, sparsevec &sp_fv, const vector<string> &cpostags, const double scale /*= 1.0*/)
	{
		const int len = inst->size();
		fvec fv;
		list<string> feats_str;
		list<double> probs;		

		for (int i = 1; i <= len; ++i) {
			const int pos_id = (i == len ? pos_id_dummy() : get_pos_id(cpostags[i]));
			assert(pos_id >= 0);
			const int offset = _pos_feat_offset + _pos_feat_dim * pos_id;
			{
				feats_str.clear();
				addPOSFeature_unigram(inst, i, feats_str, false);
				_pos_feat_dict.map_all(&fv, offset, feats_str, _generation_mode);
				parameters::sparse_add(sp_fv, &fv, scale);
				fv.dealloc();
			}
			
			if (_use_guide_postag) {
				feats_str.clear();
				probs.clear();
				addPOSFeature_unigram_guide(inst, i, feats_str, probs);
				_pos_feat_dict.map_all(&fv, offset, feats_str, probs, _generation_mode);
				parameters::sparse_add(sp_fv, &fv, scale);
				fv.dealloc();
			}

			{
				feats_str.clear();
				addPOSFeature_bigram(inst, i, cpostags[i-1], feats_str);
				_pos_feat_dict.map_all(&fv, offset, feats_str, _generation_mode);
				parameters::sparse_add(sp_fv, &fv, scale);
				fv.dealloc();
			}
			
		}
	}

	void FGen::save_dictionaries( const string &dictdir ) /*const*/
	{
		assert(!_generation_mode);
		cerr << _name << " : saving feature dictionaries to \""
			<< dictdir << "\"" << endl;

		_word_dict.save(dictdir + "/word.dict.gz");
		_pos_dict.save(dictdir + "/pos.dict.gz");

		_pos_feat_dict.save(dictdir + "/pos.features.gz");
	}

	void FGen::load_dictionaries( const string &dictdir )
	{
		assert(!_generation_mode);
		cerr << _name << " : loading feature dictionaries from \""
			<< dictdir << "\""; print_time();

		_word_dict.load(dictdir + "/word.dict.gz", 0);
		_pos_dict.load(dictdir + "/pos.dict.gz", 0);
		_pos_id_2_str.resize(tag_number());
		_pos_id_2_str = NULL;
		_pos_dict.collect_keys(_pos_id_2_str.c_buf(), tag_number());

		_pos_feat_dict.load(dictdir + "/pos.features.gz", _fcutoff);

		_pos_feat_offset = 0;
		_pos_feat_dim = _pos_feat_dict.dimensionality();

		_total_feature_dim = _pos_feat_offset + _pos_feat_dim * tag_number();

		cerr << "word    number: " << _word_dict.dimensionality() << endl;
		cerr << "postag  number: " << _pos_dict.dimensionality() << endl;
		cerr << "pos     feature dimensionality: " << _pos_feat_dim << endl;
		cerr << "total   feature dimensionality: " << _total_feature_dim << endl;
		cerr << "pos     feature start   offset: " << _pos_feat_offset << endl;

		cerr << "\n done!"; print_time();
	}

	/**
	 * Pass the tags (and optionally the forms) of inst through get_pos_id()
	 * and get_word_id() for positions 1 .. inst->size()-1. The returned ids are
	 * discarded. Asserts that generation mode is on.
	 *
	 * @param inst         instance whose forms / cpostags are visited
	 * @param collect_word if true, get_word_id() is also called on each form
	 */
	void FGen::collect_word_postag( Instance * const inst, const bool collect_word/*=false*/ )
	{
		assert(_generation_mode);

		const int end = inst->size();
		int pos = 1;	// position 0 is skipped
		while (pos < end) {
			if (collect_word) {
				get_word_id(inst->forms[pos]);
			}
			get_pos_id(inst->cpostags[pos]);
			++pos;
		}
	}

} // namespace dparser


