#include "IOPipe.h"
#include <iterator>

#include "CharUtils.h"

using namespace std;
using namespace egstra;


namespace dparser {

	/**
	 * Fill in the derived per-token fields of an instance according to the
	 * options enabled on this pipe (_english, _use_lexicon_feature,
	 * _use_guide_postag, _use_guide_postag_prob, _constrained_tag,
	 * _copy_cpostag_from_postag).
	 *
	 * Slot 0 is never visited by the per-token loop; it only receives
	 * NO_CPOSTAG as its guide tag (presumably it is the artificial root
	 * token -- confirm against the reader).
	 *
	 * @param inst instance to update in place.
	 */
	void IOPipe::preprocessInstance( Instance *inst)
	{
		const int n_tokens = inst->size();

		// Size the per-token buffers that depend on the language setting.
		if (!_english) {
			inst->chars.resize(n_tokens);
		} else {
			inst->contain_hyphen.resize(n_tokens);
			inst->contain_number.resize(n_tokens);
			inst->contain_uppercase_char.resize(n_tokens);
		}

		if (_use_lexicon_feature) set_lexicon_feature_list(inst);

		if (_use_guide_postag) {
			// Shrink to zero first so that every slot is freshly created.
			inst->guide_postags.resize(0);
			inst->guide_postags.resize(n_tokens);
			inst->guide_postags[0] = NO_CPOSTAG;

			if (_use_guide_postag_prob) {
				// Every probability starts at 1 and is overwritten below
				// only when the token carries a parsable value.
				inst->guide_probs.clear();
				inst->guide_probs.resize(n_tokens, 1.);
			}
		}

		if (_constrained_tag) {
			inst->constrained_tags_str.resize(0);
			inst->constrained_tags_str.resize(n_tokens);
		}

		for (int tok = 1; tok < n_tokens; ++tok) {
			if (_copy_cpostag_from_postag)
				inst->cpostags[tok] = inst->postags[tok];

			if (_english) {
				// Surface-form indicator features, stored as strings.
				const string &surface = inst->forms[tok];
				if (contain_hyphen(surface))
					inst->contain_hyphen[tok] = "hyp=y";
				else
					inst->contain_hyphen[tok] = "hyp=n";
				if (contain_number(surface))
					inst->contain_number[tok] = "num=y";
				else
					inst->contain_number[tok] = "num=n";
				if (contain_uppercase_character(surface))
					inst->contain_uppercase_char[tok] = "upc=y";
				else
					inst->contain_uppercase_char[tok] = "upc=n";
			}
			// Non-English: inst->chars[tok] is left as sized above; the
			// per-token character split is currently disabled.

			if (_use_guide_postag) {
				inst->guide_postags[tok] = inst->postags[tok];
				if (_use_guide_postag_prob) {
					// Only values starting with a digit are converted;
					// "", "_" and anything else keep the default.
					const string &raw_prob = inst->orig_feats[tok];
					const bool starts_with_digit =
						!raw_prob.empty() && raw_prob != "_" &&
						raw_prob[0] >= '0' && raw_prob[0] <= '9';
					if (starts_with_digit)
						inst->guide_probs[tok] = toDouble(raw_prob);
				}
			}

			if (_constrained_tag) {
				// The allowed tags are read from pdeprels, '_'-separated.
				vector<string> allowed_tags;
				simpleTokenize(inst->pdeprels[tok], allowed_tags, "_");
				if (!allowed_tags.empty())
					inst->constrained_tags_str[tok] = allowed_tags;
			}
		}
	}

	void IOPipe::load_lexicon(const string &file) {
		cerr << "loading lexicon  from \""
			<< file << "\""; print_time();

		_lexicon.clear();
		_maxwordlen = 0;

		char filename[100];
		strcpy(filename, file.c_str());
		ifstream inf;
		inf.open(filename);

		if (!inf.is_open()) {
			cerr << "Open Lexicon File Error:" << filename << endl;
			exit(-1);
		 }

		string line;
		stringstream ss;
		string word;

		while (getline(inf, line)) {
			ss.clear();
			ss.str(line);
			ss >> word ;
			_lexicon[word] = 1;
			vector<string> chars;
			int len = getCharactersFromUTF8String(word, chars);
			if( len > _maxwordlen){
				_maxwordlen= len;
			}
		} 

	
		cerr << "Max word length : " << _maxwordlen  << endl; 
		cerr << "Total words number : " << _lexicon.size()  << " words" << endl; 
		cerr << "Load lexicon done. " << endl;
		
		inf.close();
	}

	
	/**
	 * Compute the lexicon-match features of an instance.
	 *
	 * For every start position (from 1), candidate spans are tried from
	 * _maxwordlen tokens down to 1; the first span whose text (as built by
	 * get_word) is found in _lexicon wins, and the search for that start
	 * position stops. For the winning span of length L:
	 *   - fbegin_vec[start] and fend_vec[end] are set to L (as a string);
	 *   - if L > 2, each strictly interior position gets fmiddle_vec set to
	 *     L when L exceeds the value already stored there.
	 *
	 * @param inst instance whose fbegin_vec / fend_vec / fmiddle_vec are
	 *             rewritten.
	 */
	void IOPipe::set_lexicon_feature_list(Instance *inst){
		const int n_slots = inst->size();

		// Size each feature vector, then assign "0" to it.
		// NOTE(review): presumably the assignment resets every slot to "0";
		// that depends on the container's operator=, declared elsewhere.
		inst->fbegin_vec.resize(n_slots);
		inst->fend_vec.resize(n_slots);
		inst->fmiddle_vec.resize(n_slots);

		inst->fbegin_vec = "0";
		inst->fend_vec = "0";
		inst->fmiddle_vec = "0";

		for (int start = 1; start < inst->size(); ++start) {
			// Longest candidate first, so the first hit is the longest match.
			for (int span = _maxwordlen; span != 0; --span) {
				const int stop = start + span - 1;

				// The candidate would run past the last token: try a shorter one.
				if (start > (inst->size() - span))
					continue;

				string candidate;
				get_word(inst, start, stop, candidate);

				// Not a lexicon entry: try a shorter span.
				if (_lexicon.find(candidate) == _lexicon.end())
					continue;

				// The candidate was found in the lexicon; features carry the
				// span length rendered as decimal text.
				stringstream span_ss;
				span_ss << span;
				const string span_text = span_ss.str();

				inst->fbegin_vec[start] = span_text;
				inst->fend_vec[stop] = span_text;

				if (span > 2) {
					// Interior positions keep the largest span seen so far.
					for (int mid = start + 1; mid < stop; ++mid) {
						const int current = atoi(inst->fmiddle_vec[mid].c_str());
						if (span > current)
							inst->fmiddle_vec[mid] = span_text;
					}
				}
				break;
			}
		}
	}
	

	/**
	 * Read instances from the input file through m_reader, replacing any
	 * instances currently held by this pipe.
	 *
	 * When _use_lexicon_feature is set, the lexicon is dropped and reloaded
	 * from _lexicon_path first. Each instance returned by the reader is then
	 * either discarded or kept:
	 *   - discarded if forms and cpostags differ in size;
	 *   - discarded if instMaxLen > 0 and the instance is longer than that;
	 *   - otherwise kept: with _use_instances_posi only the file position
	 *     captured before the read is stored and the instance is freed;
	 *     without it the instance is stored and preprocessed.
	 * Discarded instances are reported on cerr with a running counter.
	 *
	 * @param startId     id given to the first instance; later ids are
	 *                    startId plus the number of instances kept so far.
	 * @param maxInstNum  if > 0, reading stops once this many are kept.
	 * @param instMaxLen  if > 0, maximum accepted instance size.
	 */
	void IOPipe::getInstancesFromInputFile( const int startId /*= 0*/, const int maxInstNum/*=-1*/, const int instMaxLen/*=-1*/ )
	{
		cerr << "Get all instances from " << m_inf_name; print_time();
		dealloc_instance();

		if (_use_lexicon_feature) {
			dealloc_lexicon();	
			load_lexicon(_lexicon_path);
		}
		_start_id = startId;

		int n_discarded = 0;
		for (;;) {
			// Remember where this instance starts before the reader moves on.
			const size_t start_posi = _inf_current_posi;
			const int next_id = startId + getInstanceNum();

			Instance * const inst = m_reader->getNext(next_id, _inf_current_posi);
			if (!inst) break;

			// Malformed instance (Wenliang's data): drop it and read on.
			if (inst->forms.size() != inst->cpostags.size()) {
				cerr << "[BF " << n_discarded << ":" << inst->size() << "] ";
				++n_discarded;
				delete inst;
				continue;
			}

			// Length filter, kept consistent with the old version.
			const bool too_long = (instMaxLen > 0 && inst->size() > instMaxLen);
			if (too_long) {
				cerr << " [" << n_discarded << ":" << inst->size() << "] ";
				++n_discarded;
				delete inst;
			} else if (_use_instances_posi) {
				// Position-only mode: keep the offset, not the instance.
				delete inst;
				m_instances_posi.push_back(start_posi);
			} else {
				m_instances.push_back(inst);
				preprocessInstance(inst);
			}

			if (maxInstNum > 0 && getInstanceNum() == maxInstNum) break;
		}

		fillVecInstIdxToRead();

		cerr << "\ninstance num: " << getInstanceNum() << endl;
		cerr << "Done!"; print_time();
	}
}


