#ifndef _FEATURE_EXTRACTER_
#define _FEATURE_EXTRACTER_

#pragma once

#include <fstream>
#include <iostream>
#include <sstream>
#include <vector>
#include <string>
#include <list>
#include <map>
using namespace std;

#include "Instance.h"
#include "common.h"
#include "StringMap.h"
#include "CharUtils.h"

#include "NRMat.h"
using namespace nr;

#include "FVec.h"
#include "FeatureDictionary.h"
using namespace egstra;

namespace dparser {
	class FGen
	{
	private:
		int _total_feature_dim;
		FeatureDictionary _word_dict;
		FeatureDictionary _pos_dict;
		NRVec<const char *> _pos_id_2_str;

		FeatureDictionary _pos_feat_dict;
		int _pos_feat_offset;
		int _pos_feat_dim;

        map<string, string> _chartype;
        string _chartype_file;

	private:
		string _name;
		bool _generation_mode;

	private: // options
		int _fcutoff; // only use features with freq >= _fcutoff 
		bool _use_guide_postag;
		bool _use_guide_postag_prob;
		bool _use_lexicon_feature;

	public:
		FGen() {
			_name = "FGen";
			_generation_mode = false;

			_pos_feat_offset = 0;
			_pos_feat_dim = 0;
		}

		~FGen() {}
		void add_feature_frequency(const int freq) {
			_pos_feat_dict.add_frequency(freq);
		}

		void process_options();
		void start_generation_mode() { _generation_mode = true; }
		void stop_generation_mode() { _generation_mode = false; }

		void dealloc_fvec_prob(Instance * const inst) const;
		void create_all_feature_vectors(Instance * const inst);
		void create_all_pos_features_according_to_tree(const Instance * const inst, sparsevec &sp_fv, const vector<string> &cpostags, const double scale = 1.0);

		void save_dictionaries(const string &dictdir) /*const*/;
		void load_dictionaries(const string &dictdir);

		int feature_dimentionality() const {
			return _total_feature_dim;
		}
		int tag_number() const {
			return _pos_dict.dimensionality();
		}
		void collect_word_postag( Instance * const inst, const bool collect_word=false); // when creating dictionaries,  collect word/postag
		int get_word_id(const string &word) {
			return _word_dict.getFeature(word, _generation_mode);
		}
		int get_pos_id(const string &pos) {
			const int id = _pos_dict.getFeature(pos, _generation_mode);
			if (id < 0) {
				cerr << "unknown pos type: " << pos << endl;
				exit(-1);
			}
			return id;		
		}
		int pos_id_dummy() {
			return get_pos_id(NO_CPOSTAG);
		}
		const char *pos_id_2_str(const int pos_id) const {
			assert(pos_id >= 0 && pos_id < _pos_id_2_str.size());
			return _pos_id_2_str[pos_id];
		}
		void assign_predicted_tag_str(Instance * const inst) {
			const int len = inst->size();
			inst->predicted_postags.resize(len);
			for (int i = 0; i < len; ++i) {
				inst->predicted_postags[i] = pos_id_2_str(inst->predicted_pos_ids[i]);
			}
		}

		void assign_filtered_tag_str(Instance * const inst) {
			const int len = inst->size();
			inst->filtered_tags.resize(len);
			for (int i = 0; i < len; ++i) {
				const vector<int> &ids = inst->filtered_tags_id[i];
				const int tagnum = ids.size();
				inst->filtered_tags[i].resize(tagnum);
				for (int j = 0; j < tagnum; ++j)
					inst->filtered_tags[i][j] = pos_id_2_str(ids[j]);
			}
		}
		
		void create_constrained_tag_matrix(Instance * const inst) {
			assert(!inst->constrained_tags_str.empty());
			inst->constrained_tags_str[0].clear();
			inst->constrained_tags_str[0].push_back(inst->cpostags[0]);

			const int len = inst->size();
			inst->constrained_tags.resize(len, tag_number());
			inst->constrained_tags = false;
			for (int i = 0; i < len; ++i) {
				const vector<string> &tags = inst->constrained_tags_str[i];
				if (tags.empty()) {
					for (int ti = 0; ti < tag_number(); ++ti) {
						inst->constrained_tags[i][ti] = true;
					}
				} else {
					for (int ti = 0; ti < tags.size(); ++ti) {
						inst->constrained_tags[i][ get_pos_id(tags[ti]) ] = true;
					}
				}
			}
		}
		
	private:
		void addPOSFeature_unigram(const Instance *inst, const int node_id, list<string> &feats_str, bool use_guide_pos = false) const;
		void addPOSFeature_unigram_guide(const Instance *inst, const int node_id, list<string> &feats_str, list<double> &probs) const;
		void addPOSFeature_bigram( const Instance *inst, const int node_id, const string &cpostag_L1, list<string> &feats_str ) const;

        string getCharType(const string &str) const {
            const map<string, string>::const_iterator it = _chartype.find(str);
            if (it != _chartype.end()) {
                return it->second;
            }
            if (str.size() == 1) {
                if (str[0] >= '0' && str[0] <= '9') return "en-num";
                if ((str[0] >= 'a' && str[0] <= 'z') || (str[0] >= 'A' && str[0] <= 'Z')) return "en-aph";
                return "en-other";
            } else {
                return "non-en";
            }
        }

        void load_chartype() {
            _chartype.clear();
            ifstream inf;
            inf.open(_chartype_file.c_str(), std::ios::binary);
            if (!inf.is_open()) {
                cerr << "open chartype file error: " << _chartype_file << endl;
                exit(-1);
            }
            string line;
            while (egstra::my_getline(inf, line)) {
                if (line.empty()) continue;
                vector<string> vec;
                egstra::simpleTokenize(line, vec, " \t");
                if (vec.size() < 2) {
                    cerr << "\nInvalide line in chartype file: " << line << endl;
                    continue;
                }
                for (int i = 1; i < vec.size(); ++i) {
                    _chartype[vec[i]] = vec[0];
                    cerr << vec[i] << " " << vec[0] << " | ";
                }
            }

            cerr << endl;
            inf.close();
        }

		void usage(const char * const mesg) const;
	};
} // namespace dparser

#endif


