#include "Parser.h"
#include <cstdio>
#include <iomanip>
#include <ctime>
#include <cfloat>
using namespace std;


namespace dparser {

	// ---- Definitions of Parser's static data members ----

	// Mutex used around worker jobs; l2sgd() and l2sgd_calibration() initialise it before
	// dispatching a batch and destroy it once the batch has finished.
	sp_thread_mutex_t Parser::_mutex;
	// Condition variables; their use is not visible in this part of the file.
	sp_thread_cond_t Parser::_cond_waiting_create_feat;	    
	sp_thread_cond_t Parser::_cond_waiting_update;		    
	sp_thread_cond_t Parser::_cond_done_update;

	// Training bookkeeping (usage not visible in this part of the file).
	vector<bool> Parser::_train_features_created;
	int Parser::_train_create_feat_inst_i;
	int Parser::_train_update_inst_i;

	// SGD state: accumulated loss of the current epoch, the t0 offset and the update counter t.
	double Parser::_sum_loss, Parser::_t0, Parser::_t;
	// SGD state: regularisation coefficient, current learning rate, weight decay factor and
	// the ratio eta/decay (see l2sgd()).
	double Parser::_lambda, Parser::_eta, Parser::_decay, Parser::_gain;
	// Per-batch update buffer: zeroed before each batch and added to the weights afterwards.
	vector<double> Parser::_g;
	// Set to true before every evaluate() call visible in this file.
	bool Parser::_mbr_decoding;
	// "test-tag-filter" option and its probability ratio threshold ("test-tag-filter-lambda").
	bool Parser::_test_tag_filter;
	double Parser::_test_tag_filter_lambda;

	// "gradient-update-allow-conflict" option (consumer not visible in this part of the file).
	bool Parser::_gradient_update_allow_conflict;

	// Reads all command-line/config options into Parser members.
	//
	// Order of work: (1) let every pipe and the feature generator read their own options,
	// (2) assign defaults, (3) override the defaults with whatever options::get() reports.
	// options::get(name, out) is assumed to return true and fill `out` when the option
	// was supplied -- its definition is not visible here.
	//
	// Training-only options are read only when "train" is enabled, test-only options only
	// when "test" is enabled; optimiser options only for the selected "train-method".
	void Parser::process_options()
	{
		m_pipe_train.process_options();
		m_pipe_train2.process_options();
		m_pipe_train3.process_options();
		m_pipe_test.process_options();
		m_pipe_dev.process_options();
		m_pipe_dev2.process_options();
		m_fgen.process_options();

		// ---- defaults ----
		_train = false;
		_test = false;
		_inst_max_len_to_throw = 150;
		_inst_max_num_eval = -1;		// negative: no cap (see test())
		_inst_max_num_train = -1;
		_inst_max_num_train2 = -1;
		_inst_max_num_train3 = -1;
		_test_batch_size = 10000;


		_display_interval = 100;
		_verify_decoding_algorithm = true;

		_dictionary_path = ".";
		_parameter_path = ".";		

		_self_training = false;
		_use_train2 = false;
		_use_train3 = false;
		_filename_train2 = "";
		_filename_train3 = "";
		// Non-positive: use every instance of the corpus in each iteration (see prepare_train_instances()).
		_inst_num_from_train3_one_iter = -1;
		_inst_num_from_train2_one_iter = -1;
		_inst_num_from_train1_one_iter = -1;
		// NOTE(review): the next two assignments repeat the defaults already set above.
		_inst_max_num_train2 = -1;
		_inst_max_num_train3 = -1;
		
		_filename_train = "";
		_filename_dev = "";
		_filename_dev2 = "";
		_iter_num = 20;
		_use_dev2 = false;

		_dictionary_exist = false;
		_pamameter_exist = false;
		_param_tmp_num = -1;

		_filename_test = "";
		_filename_output = "";
		_param_num_for_eval = -1;

		// Scratch variables receiving option values of the three supported types.
		int tmp; string strtmp;	double dtmp;

		_thread_num = 5;
		if (options::get("thread-num", tmp)) {	assert(tmp > 0); _thread_num = tmp;	}
		// NOTE(review): "train"/"test" are enabled by any non-zero value, whereas the
		// other boolean options below require the value to be exactly 1.
		if (options::get("train", tmp)) {
			_train = tmp;
		}
		if (options::get("test", tmp)) {
			_test = tmp;
		}
		
		// ---- training options ----
		if (_train) {
			if(options::get("train-file", strtmp)) {
				_filename_train = strtmp;
			}
			if(options::get("dev-file", strtmp)) {
				_filename_dev = strtmp;
			}
			if(options::get("iter-num", tmp)) {
				_iter_num = tmp;
			}

			if (options::get("constrained-tag-train1", tmp)) {
				m_pipe_train.set_constrained_tag_flag( (1 == tmp) );
			}
			if(options::get("inst-num-from-train-1-one-iter", tmp)) {
                _inst_num_from_train1_one_iter = tmp;
            }
			// Second training corpus: its options are only read when it is enabled.
			if (options::get("use-train-2", tmp)) {
				_use_train2 = (1 == tmp);
			}
			if (_use_train2) {
				if (options::get("constrained-tag-train2", tmp)) {
					m_pipe_train2.set_constrained_tag_flag( (1 == tmp) );
				}
				if(options::get("train-file-2", strtmp)) {
					_filename_train2 = strtmp;
				}
				if(options::get("inst-num-train-2", tmp)) {
					_inst_max_num_train2 = tmp;
				}
				if(options::get("inst-num-from-train-2-one-iter", tmp)) {
					_inst_num_from_train2_one_iter = tmp;
				}
			}
			// Third training corpus: same pattern as the second one.
			if (options::get("use-train-3", tmp)) {
				_use_train3 = (1 == tmp);
			}
			if (_use_train3) {
				if (options::get("constrained-tag-train3", tmp)) {
					m_pipe_train3.set_constrained_tag_flag( (1 == tmp) );
				}
				if(options::get("train-file-3", strtmp)) {
					_filename_train3 = strtmp;
				}
				if(options::get("inst-num-train-3", tmp)) {
					_inst_max_num_train3 = tmp;
				}
				if(options::get("inst-num-from-train-3-one-iter", tmp)) {
					_inst_num_from_train3_one_iter = tmp;
				}
			}
			// Optional second development set.
			if (options::get("use-dev-2", tmp)) {
				_use_dev2 = (1 == tmp);
			}
			if (_use_dev2) {
				if(options::get("dev-file-2", strtmp)) {
					_filename_dev2 = strtmp;
				}
			}
		}

		// ---- test options ----
		_test_tag_filter = false;
		_test_tag_filter_lambda = 1e-3;
		if (_test) {
			if (options::get("constrained-tag-test", tmp)) {
				m_pipe_test.set_constrained_tag_flag( (1 == tmp) );
			}

			if (options::get("test-tag-filter", tmp)) {
				_test_tag_filter = (1 == tmp);
			}
			if (_test_tag_filter) {
				if (options::get("test-tag-filter-lambda", dtmp)) {
					_test_tag_filter_lambda = dtmp;
				}
			}

			if (options::get("test-batch-size", tmp)) {
				_test_batch_size = tmp;
			}
		}

		// ---- options read regardless of train/test mode ----
		if(options::get("inst-max-len-to-throw", tmp)) {
			_inst_max_len_to_throw = tmp;
		}

		if(options::get("inst-max-num-train", tmp)) {
			_inst_max_num_train = tmp;
		}
		if(options::get("inst-max-num-eval", tmp)) {
			_inst_max_num_eval = tmp;
		}

		// NOTE(review): the value is used as a modulus when printing progress
		// (evaluate(), create_dictionaries()); it is not checked for zero here.
		if(options::get("display-interval", tmp)) {
			_display_interval = tmp;
		}

		if(options::get("dictionary-path", strtmp)) {
			_dictionary_path = strtmp;
		}
		if(options::get("parameter-path", strtmp)) {
			_parameter_path = strtmp;
		}  

		if(options::get("dictionary-exist", tmp)) {
			_dictionary_exist = tmp;
		}

		if(options::get("parameter-exist", tmp)) {
			_pamameter_exist = tmp;
		}
		// When supplied, values <= 0 are clamped to 1; when absent the default stays -1.
		if(options::get("param-tmp-num", tmp)) {
			_param_tmp_num = tmp;
            if (_param_tmp_num <= 0) _param_tmp_num = 1;
		}


		if(options::get("test-file", strtmp)) {
			_filename_test = strtmp;
		}
		if(options::get("output-file", strtmp)) {
			_filename_output = strtmp;
		}
		if(options::get("param-num-for-eval", tmp)) {
			_param_num_for_eval = tmp;
		}

		// ---- optimiser selection ----
		_train_method = "l2sgd";
		if(options::get("train-method", strtmp)) {
			_train_method = strtmp;
		}
	
		_gradient_update_allow_conflict = true;
		if(options::get("gradient-update-allow-conflict", tmp)) {
			_gradient_update_allow_conflict = (1 == tmp);
		}

		// L-BFGS (CRF++ style) defaults; overridden and echoed only when that method is selected.
		_lbfgs_crfpp_opt.isL2 = true;
		_lbfgs_crfpp_opt.c = 1.0;
		_lbfgs_crfpp_opt.eta = 0.0001;
		_lbfgs_crfpp_opt.shrinking_size = 20;
		if (_train_method == "lbfgs-crfpp") {
			if (options::get("lbfgs-crfpp-isL2", tmp)) _lbfgs_crfpp_opt.isL2 = (tmp == 1);
			if (options::get("lbfgs-crfpp-c", dtmp)) _lbfgs_crfpp_opt.c = dtmp;
			if (options::get("lbfgs-crfpp-eta", dtmp)) _lbfgs_crfpp_opt.eta = dtmp;
			if (options::get("lbfgs-crfpp-shrinking-size", tmp)) _lbfgs_crfpp_opt.shrinking_size = tmp;
			cerr << "*** lbfgs-crfpp train options: " 
				<< _lbfgs_crfpp_opt.isL2 << "-"
				<< _lbfgs_crfpp_opt.c << "-"
				<< _lbfgs_crfpp_opt.eta << "-"
				<< _lbfgs_crfpp_opt.shrinking_size
				<< endl;
		}

		// L2-regularised SGD defaults; overridden and echoed only when that method is selected.
		// NOTE(review): _l2sgd_opt.lambda (read by l2sgd_calibration()) is not assigned here.
		_l2sgd_opt.batch_size = 20;
		_l2sgd_opt.c2 = 0.1;
		_l2sgd_opt.period = 10;
		_l2sgd_opt.delta = 1e-6;
		_l2sgd_opt.calibration_eta = 0.1;
		_l2sgd_opt.calibration_rate = 2.;
		_l2sgd_opt.calibration_samples = 1000;
		_l2sgd_opt.calibration_candidates = 10;
		_l2sgd_opt.calibration_max_trials = 20;
		if (_train_method == "l2sgd") {
			if (options::get("l2sgd-batch-size", tmp)) { _l2sgd_opt.batch_size = tmp; assert(tmp >= 1); }
			if (options::get("l2sgd-c2", dtmp))	_l2sgd_opt.c2 = dtmp;
			if (options::get("l2sgd-period", tmp)) _l2sgd_opt.period = tmp;
			if (options::get("l2sgd-delta", dtmp)) _l2sgd_opt.delta = dtmp;
			if (options::get("l2sgd-calibration-eta", dtmp)) _l2sgd_opt.calibration_eta = dtmp;
			if (options::get("l2sgd-calibration-rate", dtmp)) _l2sgd_opt.calibration_rate = dtmp;
			if (options::get("l2sgd-calibration-samples", tmp)) _l2sgd_opt.calibration_samples = tmp;
			if (options::get("l2sgd-calibration-candidates", tmp)) _l2sgd_opt.calibration_candidates = tmp;
			if (options::get("l2sgd-calibration-max-trials", tmp)) _l2sgd_opt.calibration_max_trials = tmp;
			cerr << "*** l2sgd train options: " 
				<< _l2sgd_opt.batch_size << "-"
				<< _l2sgd_opt.c2 << "-"
				<< _l2sgd_opt.period << "-" 
				<< _l2sgd_opt.delta << "-" 
				<< _l2sgd_opt.calibration_eta << "-" 
				<< _l2sgd_opt.calibration_rate << "-" 
				<< _l2sgd_opt.calibration_samples << "-" 
				<< _l2sgd_opt.calibration_candidates << "-" 
				<< _l2sgd_opt.calibration_max_trials
				<< endl;
		}
	}

	// Rebuilds _inst_idx_to_read, the list of global instance indices visited in one
	// training iteration.
	//
	// Every enabled corpus is shuffled, then its first n instances are selected, where n is
	// the per-iteration quota (_inst_num_from_trainK_one_iter) when that quota is positive
	// and smaller than the corpus, and the whole corpus otherwise. Global indices are laid
	// out corpus after corpus: [0, n1) is corpus 1, [n1, n1+n2) is corpus 2 and
	// [n1+n2, ...) is corpus 3, with n1/n2 being the full corpus sizes. The resulting list
	// is finally shuffled so that the corpora are interleaved.
	void Parser::prepare_train_instances()
	{
		_inst_idx_to_read.clear();

		// Corpus 1 (always used).
		m_pipe_train.shuffleTrainInstances();
		const int total1 = m_pipe_train.getInstanceNum();
		int used1 = total1;
		if (_inst_num_from_train1_one_iter > 0 && _inst_num_from_train1_one_iter < total1) {
			used1 = _inst_num_from_train1_one_iter;
		}
		for (int k = 0; k < used1; ++k) {
			_inst_idx_to_read.push_back(k);
		}

		// Corpus 2 (optional); its global indices start right after corpus 1.
		int total2 = 0;
		int used2 = 0;
		if (_use_train2) {
			m_pipe_train2.shuffleTrainInstances();
			total2 = m_pipe_train2.getInstanceNum();
			used2 = total2;
			if (_inst_num_from_train2_one_iter > 0 && _inst_num_from_train2_one_iter < total2) {
				used2 = _inst_num_from_train2_one_iter;
			}
			for (int k = 0; k < used2; ++k) {
				_inst_idx_to_read.push_back(total1 + k);
			}
		}

		// Corpus 3 (optional); its global indices start after corpus 1 and corpus 2.
		int total3 = 0;
		int used3 = 0;
		if (_use_train3) {
			m_pipe_train3.shuffleTrainInstances();
			total3 = m_pipe_train3.getInstanceNum();
			used3 = total3;
			if (_inst_num_from_train3_one_iter > 0 && _inst_num_from_train3_one_iter < total3) {
				used3 = _inst_num_from_train3_one_iter;
			}
			for (int k = 0; k < used3; ++k) {
				_inst_idx_to_read.push_back(total1 + total2 + k);
			}
		}

		// Report how many instances each corpus contributes.
		cerr << "instance num from train1: " << used1 << endl;
		if (_use_train2) {
			cerr << "instance num from train2: " << used2 << endl;
		}
		if (_use_train3) {
			cerr << "instance num from train3: " << used3 << endl;
		}
		cerr << "instance num total: " << _inst_idx_to_read.size() << endl;

		// Mix the instances of the different corpora.
		random_shuffle(_inst_idx_to_read.begin(), _inst_idx_to_read.end());
	}

	void Parser::pre_train()
	{
		m_pipe_train.openInputFile( _filename_train.c_str() );
		m_pipe_train.getInstancesFromInputFile(0, _inst_max_num_train, _inst_max_len_to_throw);
		const int inst_num_train1 = m_pipe_train.getInstanceNum();
		int inst_num_train2 = 0;
		int inst_num_train3 = 0;
		if (_use_train2) {
			m_pipe_train2.openInputFile( _filename_train2.c_str() );
			m_pipe_train2.getInstancesFromInputFile(inst_num_train1, _inst_max_num_train2, _inst_max_len_to_throw);
			inst_num_train2 = m_pipe_train2.getInstanceNum();
		}
		if (_use_train3) {
			inst_num_train3 = m_pipe_train.getInstanceNum();
			m_pipe_train3.openInputFile( _filename_train3.c_str() );
			m_pipe_train3.getInstancesFromInputFile(inst_num_train1+inst_num_train2, _inst_max_num_train3, _inst_max_len_to_throw);
			//inst_num_train3 = m_pipe_train3.getInstanceNum();
		}

		m_pipe_dev.use_instances_posi(false);
		m_pipe_dev.openInputFile( _filename_dev.c_str() );
		m_pipe_dev.getInstancesFromInputFile(0, _inst_max_num_eval, _inst_max_len_to_throw);
		m_pipe_dev.closeInputFile();
		if (_use_dev2) {
			m_pipe_dev2.use_instances_posi(false);
			m_pipe_dev2.openInputFile( _filename_dev2.c_str() );
			m_pipe_dev2.getInstancesFromInputFile(0, _inst_max_num_eval, _inst_max_len_to_throw);
			m_pipe_dev2.closeInputFile();
		}

		if (!_dictionary_exist) {
			m_fgen.start_generation_mode();
			create_dictionaries(m_pipe_train, true);
			create_dictionaries(m_pipe_dev, false);
			if (_use_dev2) create_dictionaries(m_pipe_dev2, false);
			if (_use_train2) {
				//m_fgen.add_feature_frequency(1000);
				create_dictionaries(m_pipe_train2, true);
			}
			if (_use_train3) {
				//m_fgen.add_feature_frequency(1000);
				create_dictionaries(m_pipe_train3, true);
			}
			
			m_fgen.stop_generation_mode();
			save_dictionaries();
			exit(0);
		}

		load_dictionaries();

		assert(!m_decoder);
		m_decoder = new_decoder();
	}

	// Decodes every instance currently held by `pipe` on the thread pool and scores the result.
	//
	//   pipe    - instance source; must not be in use_instances_posi mode.
	//   is_test - forwarded to each parse job. When true, every decoded instance is written
	//             through the pipe (after optional tag filtering) and no metric summary is
	//             printed here; when false, the summary is printed at the end.
	//
	// All evaluation counters are reset on entry. The process is terminated with exit(-1)
	// when the pipe is in use_instances_posi mode or the thread pool has not been created.
	void Parser::evaluate( IOPipe &pipe, const bool is_test )
	{
		reset_evaluate_metrics();

		// Preconditions.
		if (pipe.use_instances_posi()) {
			cerr << "do not use_instances_posi for test/dev data: " << pipe.input_filename() << endl;
			exit(-1);
		}
		if (!_tp) {
			cerr << "threadpool not created yet!" << endl;
			exit(-1);
		}

		const int total = pipe.getInstanceNum();
		inst_num_processed_total += total;

		// Phase 1: queue one parse job per instance, printing progress while dispatching.
		for (int idx = 0; idx < total; ++idx) {
			Instance * const queued = pipe.getInstance(idx);

			// The argument object is handed over to the job (presumably freed by the worker).
			thread_arg_t * const job = new thread_arg_t(this, -1, queued, -1, is_test);
			dispatch_threadpool(_tp, parse_one_inst_thread, (void *)job);

			if (idx % _display_interval == 0) {
				cerr << idx << " ";
			}
			if (idx % (_display_interval * 10) == 0) {
				print_time();
			}
		}

		wait_all_jobs_done(_tp);

		// Phase 2: score (and, in test mode, emit) the decoded instances in order.
		for (int idx = 0; idx < total; ++idx) {
			Instance * const decoded = pipe.getInstance(idx);
			evaluate_one_instance(decoded);

			if (!is_test) continue;

			if (_test_tag_filter) {
				evaluate_output_tag_filter(decoded);
			}
			pipe.writeInstance(decoded);
		}

		cerr << "\ninstance num: " << total;
		print_time();
		if (!is_test) {
			output_evaluate_metrics();
		}
	}

	// Zeroes every evaluation counter: processed instances, word count, the three
	// correct-tag counters (joint / a / b) and the OOV counters.
	void Parser::reset_evaluate_metrics()
	{
		// instance / word totals
		inst_num_processed_total = 0;
		nword = 0;

		// correct-prediction counters
		ncorrect_joint = 0;
		ncorrect_a_max = 0;
		ncorrect_b_max = 0;

		// out-of-vocabulary counters
		noov = 0;
		noov_correct = 0;
	}

	// Prints the accuracies accumulated by evaluate_one_instance() to stderr.
	// A line is printed only for counters that are greater than zero; each line shows
	// "correct/total = percentage" with the stream precision set to 5.
	void Parser::output_evaluate_metrics()
	{
		cerr.precision(5);

		if (ncorrect_joint > 0) {
			cerr << "accuracy (joint):  \t\t"
				<< ncorrect_joint << "/" << nword
				<< " = " << ncorrect_joint*100.0/nword
				<< endl;
		}

		if (ncorrect_a_max > 0) {
			cerr << "accuracy (a_max)    :  \t\t"
				<< ncorrect_a_max << "/" << nword
				<< " = " << ncorrect_a_max*100.0/nword
				<< endl;
		}

		if (ncorrect_b_max > 0) {
			cerr << "accuracy (b_max)    :  \t\t"
				<< ncorrect_b_max << "/" << nword
				<< " = " << ncorrect_b_max*100.0/nword
				<< endl;
		}
	}
/*
	void Parser::assign_1_best_tag_seq( Instance *inst )
	{
		const int len = inst->size();
		assert(inst->filtered_tags.size() == len);
		inst->predicted_postags.clear();
		inst->predicted_postags.resize(len);

		for (int wi = 1; wi < len; ++wi) {
			const vector<string> &tags = inst->filtered_tags[wi];
			assert(!tags.empty());
			inst->predicted_postags[wi] = tags[0];

		}
	}
*/
	// For every position wi in [1, len) picks the tag with the highest unigram value and
	// stores its string form in predicted_tags[wi]; predicted_tags[0] stays empty.
	//
	//   joint_a_b      - which table to read: 0 = joint, 1 = a, anything else = b.
	//   predicted_tags - output; cleared and resized to the instance length.
	//
	// A tag replaces the current best only when its value exceeds it by more than EPS,
	// so among (near-)ties the lowest tag id wins. The running maximum starts at -1.
	void Parser::get_best_tag_seq( Instance *inst, const int joint_a_b, vector<string> &predicted_tags )
	{
		// Select the unigram table requested by the caller.
		const NRMat<double> *table = &inst->prob_unigram_b;
		if (joint_a_b == 0) {
			table = &inst->prob_unigram_joint;
		} else if (joint_a_b == 1) {
			table = &inst->prob_unigram_a;
		}
		const NRMat<double> &probs = *table;

		const int len = inst->size();
		const int tag_num = probs.ncols();

		predicted_tags.clear();
		predicted_tags.resize(len);

		for (int wi = 1; wi < len; ++wi) {
			int best_tag = -1;
			double best_prob = -1;
			for (int ti = 0; ti < tag_num; ++ti) {
				const double p = probs[wi][ti];
				if (p > best_prob + EPS) {
					best_prob = p;
					best_tag = ti;
				}
			}
			predicted_tags[wi] = pos_id_2_str(joint_a_b, best_tag);
		}
	}

	// Builds, for every position wi in [1, len), the list of candidate tags that survive
	// the probability filter and stores them in inst->filtered_tags / inst->filtered_probs.
	//
	//   joint_a_b - which table to read: 0 = joint, 1 = a, anything else = b.
	//
	// Per position the tag ids are first ordered by decreasing value (insertion sort; a tag
	// is placed behind another one only if it is smaller by more than EPS). The best tag is
	// always kept; the following tags are kept, in order, as long as their value is greater
	// than _test_tag_filter_lambda * best_value + EPS. Position 0 gets empty lists.
	void Parser::filter_tag( Instance *inst, const int joint_a_b )
	{
		const NRMat<double> *table = &inst->prob_unigram_b;
		if (joint_a_b == 0) {
			table = &inst->prob_unigram_joint;
		} else if (joint_a_b == 1) {
			table = &inst->prob_unigram_a;
		}
		const NRMat<double> &probs = *table;

		const int len = inst->size();
		const int tag_num = probs.ncols();

		inst->filtered_tags.clear();
		inst->filtered_tags.resize(len);
		inst->filtered_probs.clear();
		inst->filtered_probs.resize(len);

		for (int wi = 1; wi < len; ++wi) {
			// ranked[r] = id of the tag with the r-th largest value seen so far.
			vector<int> ranked;
			ranked.push_back(0);
			for (int ti = 1; ti < tag_num; ++ti) {
				ranked.push_back(-1);	// open a slot at the end
				int slot = static_cast<int>(ranked.size()) - 1;
				// Shift entries down until the predecessor is larger than ti by more than EPS.
				while (slot > 0 && !(probs[wi][ti] < probs[wi][ ranked[slot-1] ] - EPS)) {
					ranked[slot] = ranked[slot-1];
					--slot;
				}
				ranked[slot] = ti;
			}

			// The top-ranked tag is kept unconditionally.
			inst->filtered_tags[wi].push_back( pos_id_2_str(joint_a_b, ranked[0]) );
			inst->filtered_probs[wi].push_back( probs[wi][ranked[0]] );

			// Keep further tags until the first one that fails the threshold.
			for (int r = 1; r < ranked.size(); ++r) {
				const int tag = ranked[r];
				if (!(probs[wi][tag] > _test_tag_filter_lambda * inst->filtered_probs[wi][ 0 ] + EPS)) {
					break;
				}
				inst->filtered_tags[wi].push_back( pos_id_2_str(joint_a_b, tag) );
				inst->filtered_probs[wi].push_back( probs[wi][tag] );
			}
		}
	}

	// Runs the parser over the test file in batches and writes the results to the output file.
	//
	//   iter - number of the parameter set to load (must be >= 1).
	//
	// When not running in training mode the dictionaries are loaded and the decoder is
	// created here first. With tag filtering enabled, "<output-file>.prob" is opened as well
	// and the filter statistics are printed at the end. The process exits with -1 if that
	// file cannot be opened.
	//
	// The input is consumed in batches of at most _test_batch_size instances. A
	// non-negative _inst_max_num_eval caps the total number of instances read.
	void Parser::test(const int iter)
	{
		assert(iter >= 1);
		cerr << "\n\n eval: " << iter; print_time();

		m_pipe_test.use_instances_posi(false);
		m_pipe_test.openInputFile( _filename_test.c_str() );
		m_pipe_test.openOutputFile( _filename_output.c_str() );

		if (!_train) {
			load_dictionaries();
			assert(!m_decoder);
			m_decoder = new_decoder();
		}

		load_parameters(iter);

		if (_test_tag_filter) {
			string filename = _filename_output + ".prob";
			_of_tag_filter_prob.open(filename.c_str());
			if (!_of_tag_filter_prob.is_open()) {
				cerr << "Parser:: open tag filter prob file err: " << filename << endl;
				exit(-1);
			}
			initialize_filter_stat();
		}

		reset_evaluate_metrics();

		// start_id is the id given to the first instance of the next batch; because it is
		// advanced by the size of every batch it is also the number of instances read so far.
		int start_id = 0;
		while (1) {
			// The remaining budget is derived from start_id rather than from
			// inst_num_processed_total: evaluate() resets that counter on entry, so after
			// the first batch it only holds the size of the most recent batch and the cap
			// given by _inst_max_num_eval would be overshot.
			const int inst_num_left = _inst_max_num_eval < 0 ? _test_batch_size : (_inst_max_num_eval - start_id);
			if (inst_num_left <= 0) break;

			m_pipe_test.getInstancesFromInputFile(start_id,		
				_test_batch_size < inst_num_left ? _test_batch_size : inst_num_left,
				_inst_max_len_to_throw);

			const int batch_inst_num = m_pipe_test.getInstanceNum();
			if (batch_inst_num <= 0) break;	// input exhausted

			start_id += batch_inst_num;

			_mbr_decoding = true;
			evaluate(m_pipe_test, true);
			m_pipe_test.dealloc_instance();
		}
		cerr << "done";  print_time(); 
		// NOTE(review): evaluate() resets the counters at the start of every batch, so the
		// figures printed here cover the last batch only.
		output_evaluate_metrics();

		if (_test_tag_filter) {
			output_filter_stat();
			_of_tag_filter_prob.close();
		}

		m_pipe_test.closeInputFile();
		m_pipe_test.closeOutputFile();
	}


	// Feeds every instance of `pipe` to the feature generator so that it can collect
	// dictionary entries.
	//
	//   pipe         - corpus to scan.
	//   collect_word - passed through to the feature generator for each instance.
	//
	// The dummy POS tags are registered first. When the pipe is in use_instances_posi mode
	// each instance is deleted right after it has been processed. Progress is written to
	// stderr every _display_interval instances.
	void Parser::create_dictionaries(IOPipe &pipe, const bool collect_word)
	{
		cerr << "\ncreating dictionaries from " << pipe.input_filename();
		print_time();

		// Special tags: one for begin/end positions and the separator tags for a and b.
		m_fgen.get_pos_id(DUMMY_CPOSTAG);
		m_fgen.get_pos_id_a(DUMMY_CPOSTAG_SEP);
		m_fgen.get_pos_id_b(DUMMY_CPOSTAG_SEP);

		int idx = 0;
		while (idx < pipe.getInstanceNum()) {
			Instance * const cur = pipe.getInstance(idx);
			m_fgen.create_all_pos_features_when_create_dict(cur, collect_word);

			if (pipe.use_instances_posi()) {
				delete cur;
			}

			if (idx % _display_interval == 0) {
				cerr << idx << " ";
			}
			if (idx % (_display_interval * 10) == 0) {
				print_time();
			}
			++idx;
		}

		cerr << "\ninstance num: " << pipe.getInstanceNum() << endl;
		cerr << "create dictionaries done";
		print_time();
	}


	void Parser::dot_all( const fvec * const fs, double * const probs, const int sz ) const
	{
		for (int i = 0; i < sz; ++i) {
			if (fs[i].n >= 0)
				probs[i] = m_param.dot(fs+i);
		}
	}

	// Fills the six score tables of `inst` (unigram and bigram, for joint / a / b) from
	// the matching feature-vector tables via dot_all().
	void Parser::compute_all_probs( Instance *inst ) const
	{
		// joint tags
		dot_all(inst->fvec_unigram_joint.c_buf(),
			inst->prob_unigram_joint.c_buf(),
			inst->fvec_unigram_joint.size());
		dot_all(inst->fvec_bigram_joint.c_buf(),
			inst->prob_bigram_joint.c_buf(),
			inst->fvec_bigram_joint.size());

		// tag set a
		dot_all(inst->fvec_unigram_a.c_buf(),
			inst->prob_unigram_a.c_buf(),
			inst->fvec_unigram_a.size());
		dot_all(inst->fvec_bigram_a.c_buf(),
			inst->prob_bigram_a.c_buf(),
			inst->fvec_bigram_a.size());

		// tag set b
		dot_all(inst->fvec_unigram_b.c_buf(),
			inst->prob_unigram_b.c_buf(),
			inst->fvec_unigram_b.size());
		dot_all(inst->fvec_bigram_b.c_buf(),
			inst->prob_bigram_b.c_buf(),
			inst->fvec_bigram_b.size());
	}

	// Consistency check of the decoder output for one instance.
	//
	// Starts from a copy of the feature vector produced during decoding
	// (inst->predicted_fv) and lets the feature generator add the features of the predicted
	// tag sequence with weight -1.0. If both agree, every entry of the difference lies
	// within [-1e-3, 1e-3]. Each entry outside that range is reported on stderr, and if at
	// least one was found the function throws the int value 1.
	void Parser::verify_decoding_algorithm( Instance * const inst )
	{
		sparsevec diff = inst->predicted_fv;
		m_fgen.create_all_pos_features_according_to_tree(inst, diff, inst->predicted_tags_joint, -1.0);

		bool mismatch_found = false;
		sparsevec::const_iterator entry = diff.begin();
		while (entry != diff.end()) {
			const bool out_of_tolerance = (entry->second > 1e-3 || entry->second < -1e-3);
			if (out_of_tolerance) {
				mismatch_found = true;
				cerr << "[inst: " << inst->id << "] viterbi - according-to-tree: not matched feature: " << entry->first << "\t value: " << entry->second << endl;
			}
			++entry;
		}

		if (!mismatch_found) return;

		cerr << "[inst: " << inst->id << "] verify_decoding_algorithm error: two FeatureVec (by two ways: viterbi vs. according-to-tree) not matched" << endl;
		throw 1;
	}

	void Parser::evaluate_one_instance( const Instance * const inst )
	{
		const int len = inst->size();
		nword += len - 1;
		for (int i = 1; i < len; ++i) {
			const string &gold = inst->cpostags[i];
			const string &sys = inst->predicted_tags_joint[i];
			if (gold == "*") continue;

			vector<string> vecgold;
			vector<string> vecsys;
			simpleTokenize(gold, vecgold, "^");
			simpleTokenize(sys, vecsys, "^");
			assert(vecgold.size() <= 2 && vecsys.size() == 2);

			//if (vecgold.size() != 2 || vecsys.size() != 2) continue;
			//if (vecgold[0] != "*" && vecgold[1] != "*") continue;
			
			if (gold == sys) ++ncorrect_joint;
			const string &gold1 = vecgold[0];
			const string &gold2 = vecgold[vecgold.size()-1];
			if (gold1 != "*") {
				//if (gold1 == inst->predicted_tags_a[i]) ++ncorrect_a_sum;
				if (gold1 == vecsys[0]) ++ncorrect_a_max;
			}
			if (gold2 != "*") {
				//if (gold2 == inst->predicted_tags_b[i]) ++ncorrect_b_sum;
				if (gold2 == vecsys[1]) ++ncorrect_b_max;
			}
		}
		//word_punc_num_pos_correct += len - 1 - error_num_pos(inst);
		//eval_oov_pos(inst, oov_num, oov_pos_correct_num);
	}

	
	// Mini-batch stochastic gradient descent with L2 regularisation.
	//
	//   N           - number of instances visited per epoch (indices 0..N-1 via get_instance()).
	//   t0          - offset of the learning-rate schedule: eta = 1 / (lambda * (t0 + t)).
	//   lambda      - L2 regularisation coefficient.
	//   num_epochs  - maximum number of epochs.
	//   calibration - true when called from l2sgd_calibration(): no instance re-sampling,
	//                 no progress report, no evaluation and no parameter saving.
	//   period      - size of the loss history; used as a modulus, so it must be > 0 when
	//                 calibration is false.
	//   epsilon     - only used as the placeholder "improvement" during the first `period`
	//                 epochs; the stopping test based on it is commented out below.
	//
	// The weights are reset to zero on entry. The loss of the last epoch is left in
	// _sum_loss. Exits the process if the thread pool has not been created.
	void Parser::l2sgd( 
		const int N, 
		const floatval_t t0, 
		const floatval_t lambda, 
		const int num_epochs, 
		const bool calibration, 
		const int period, 
		const floatval_t epsilon)
	{
		if (!_tp) {
			cerr << "no thread pool created yet!" << endl;
			exit(-1);
		}

		const int K = m_fgen.feature_dimentionality();
		_t0 = t0;
		_lambda = lambda;
		_t = 0;

		// Start from all-zero weights.
		vecset(m_param.c_buf(), 0, K);

		// Loss history of the last `period` epochs (not needed during calibration).
		vector<floatval_t> pf;
		if (!calibration) pf.resize(period);
        _best_iter_num_so_far_a = 0;
        _best_accuracy_a = 0.;
        _best_iter_num_so_far_b = 0;
        _best_accuracy_b = 0.;

		int epoch = 1;
		for (; epoch <= num_epochs; ++epoch) {
			if (!calibration) {
				cerr << "\n***** Iteration #" << epoch << " *****"; print_time();
				prepare_train_instances();
			} 

			_sum_loss = 0.;
			_decay = 1.;

			assert (_l2sgd_opt.batch_size > 0); 
			int i = 0;
			while (i < N) {
				/* Update various factors. */
				_eta = 1 / (_lambda * (_t0 + _t));
				_decay *= (1.0 - _eta * _lambda);
				_gain = _eta / _decay;

				m_param.set_scale(_decay);

				// store the updates in _g
				_g.resize(K);
				fill(_g.begin(), _g.end(), 0.);

				// Dispatch one update job per instance of this batch; the mutex exists
				// only for the lifetime of the batch.
				sp_thread_mutex_init( &_mutex, NULL );
				int b = 0;
				for (; b < _l2sgd_opt.batch_size; ++b,++i) {
					if (i >= N) break;
					Instance *inst = get_instance(i);						
					thread_arg_t *arg_update_one_inst = new thread_arg_t(this, -1, inst, i);
					dispatch_threadpool(_tp, train_update_one_inst_thread, (void *)arg_update_one_inst);
				}
				_t += b;	// b = number of instances actually dispatched in this batch
				wait_all_jobs_done(_tp);
				sp_thread_mutex_destroy( &_mutex );

				// do the updates
				for (int k = 0; k < K; ++k) m_param.c_buf()[k] += _g[k];
			}

			/* Scale the feature weights. */
			vecscale(m_param.c_buf(), _decay, K);
			//_decay = 1.;
			/* Include the L2 norm of feature weights to the objective. */
			/* The factor N is necessary because lambda = 2 * C / N. */
			const floatval_t norm2 = vecdot(m_param.c_buf(), m_param.c_buf(), K);
			_sum_loss += 0.5 * lambda * norm2 * N;

			cerr << "instance num: " << N; print_time();
			/* One epoch finished. */
			if (!calibration) {
				/* We don't test the stopping criterion while period < epoch. */
				const floatval_t improvement = period < epoch ? 
					(pf[(epoch-1) % period] - _sum_loss) / _sum_loss : epsilon;

				/* Store the current value of the objective function. */
				pf[(epoch-1) % period] = _sum_loss;

				cerr << "Loss: " <<  _sum_loss << endl;
				if (period < epoch) {
					cerr << "Improvement ratio: " << improvement << endl;
				}
				cerr << "Feature L2-norm: " << sqrt(norm2) << endl;
				cerr << "Learning rate (eta): " << _eta << endl;
				cerr << "Total number of feature updates: " << _t;
	

				// The weights were rescaled explicitly above, so the scale factor is reset.
				m_param.set_scale(1.);

				/* Holdout evaluation if necessary. */
				//_mbr_decoding = false;
				//evaluate(m_pipe_dev, false);
				//cerr << "\n*** mbr decoding ***" << endl;
				
				_mbr_decoding = true;
				evaluate(m_pipe_dev, false);
				// NOTE(review): nword == 0 (empty dev set) makes these divisions yield NaN/inf.
                double this_accuracy_a = 100.0 * ncorrect_a_max / nword;
				double this_accuracy_b = 100.0 * ncorrect_b_max / nword;
				if (_use_dev2) {
					// With a second dev set the better of the two accuracies is used.
					evaluate(m_pipe_dev2, false);
					this_accuracy_a = max(this_accuracy_a, 100.0 * ncorrect_a_max / nword);
					this_accuracy_b = max(this_accuracy_b, 100.0 * ncorrect_b_max / nword);
				}

				// Epoch numbers of saved parameter sets that are superseded in this epoch
				// and can be deleted. A set is kept while it is still the best for a or b.
				vector<int> del;
				if (this_accuracy_a > _best_accuracy_a + 1e-5) { 
					if (_best_iter_num_so_far_a > 0) {
						del.push_back(_best_iter_num_so_far_a);
					}
                    _best_iter_num_so_far_a = epoch;
                    _best_accuracy_a = this_accuracy_a;
				}

				if (this_accuracy_b > _best_accuracy_b + 1e-5) { 
					if (_best_iter_num_so_far_b > 0) {
						if (_best_iter_num_so_far_b != _best_iter_num_so_far_a) del.push_back(_best_iter_num_so_far_b);
						// Old best-a and old best-b were the same epoch: delete it only once.
						if (del.size() == 2 && del[0] == del[1]) {
							del.pop_back();
						}
					}
                    _best_iter_num_so_far_b = epoch;
                    _best_accuracy_b = this_accuracy_b;
				} else {
					// b did not improve: the old best-a set must survive if it is also best-b.
					if (del.size() > 0) {
						assert(del.size() == 1);
						if (del[0] == _best_iter_num_so_far_b) del.pop_back();
					}
				}

				for (int deli = 0; deli < del.size(); ++deli) delete_parameters(del[deli]);
				
				if (_best_iter_num_so_far_a > 0) cerr << "\nbest accuracy so far (a): " << _best_accuracy_a << " [it = " << _best_iter_num_so_far_a << "]" << endl;
				if (_best_iter_num_so_far_b > 0) cerr << "\nbest accuracy so far (b): " << _best_accuracy_b << " [it = " << _best_iter_num_so_far_b << "]" << endl;

				// Save the parameters when this epoch is the new best for a or b; stop when
				// neither has improved within the last 30 epochs.
                if (_best_iter_num_so_far_a == epoch || _best_iter_num_so_far_b == epoch) save_parameters(epoch);
                if (_best_iter_num_so_far_a+30 < epoch && _best_iter_num_so_far_b+30 < epoch) {
					cerr << "\n\n*** training stops due to no accuracy increase in many epochs\n" << endl;
					break;
				}

				/* Check for the stopping criterion. */
				// Do not check this since the loss is not meaningful when _inst_num_from_train2_one_iter
				//if (improvement < epsilon) {
				//	break;
				//}
			}
		}

		/* Output the optimization result. */
		if (!calibration) {
			// NOTE(review): after a normal loop exit epoch == num_epochs + 1; an early break
			// in the very last epoch (epoch == num_epochs) is reported as "maximum number
			// of iterations" by this test.
			if (epoch < num_epochs) {
				cerr << "SGD terminated with the stopping criteria\n";
			} else {
				cerr << "SGD terminated with the maximum number of iterations\n";
			}
		}
	}



	// Searches for a good initial learning rate (eta) for l2sgd().
	//
	// The loss of the all-zero model is computed on the first S instances
	// (S = min(instances per iteration, calibration_samples)). Then one-epoch SGD trials
	// are run on the same S instances with different eta values:
	//   - increasing phase (dec == 0): eta is multiplied by `rate` after every successful
	//     trial while candidates remain; on the first failure, or when the candidates are
	//     used up, the search switches to the decreasing phase starting at init_eta / rate
	//     with a fresh candidate budget;
	//   - decreasing phase (dec == 1): eta is divided by `rate` after every trial.
	// A trial is successful when its loss is finite and below the initial loss. The search
	// ends when the candidate budget of the decreasing phase is used up or the trial
	// counter reaches calibration_max_trials.
	//
	// Returns 1 / (lambda * best_eta), i.e. the t0 value that l2sgd() expects, where
	// best_eta is the eta of the trial with the smallest finite loss (the initial eta if
	// no trial produced a finite loss).
	floatval_t Parser::l2sgd_calibration() {
		/* Initialize a permutation that shuffles the instances. */
		prepare_train_instances();
		
		int i;
		// dec: 0 = increasing phase, 1 = decreasing phase; ok: result of the current trial.
		int dec = 0, ok, trials = 1;
		int num = _l2sgd_opt.calibration_candidates;	// successful trials still wanted

		floatval_t best_loss = DBL_MAX;
		floatval_t eta = _l2sgd_opt.calibration_eta;
		floatval_t best_eta = _l2sgd_opt.calibration_eta;
		const int N = get_inst_num_one_iter();
		const int S = min(N, _l2sgd_opt.calibration_samples);
		const int K = m_fgen.feature_dimentionality();
		const floatval_t init_eta = _l2sgd_opt.calibration_eta;
		const floatval_t rate = _l2sgd_opt.calibration_rate;
		// NOTE(review): _l2sgd_opt.lambda is not assigned in process_options(); confirm
		// that the caller sets it before calibration.
		const floatval_t lambda = _l2sgd_opt.lambda;

		cerr << "Calibrating the learning rate (eta)\n";
		cerr << "calibration.eta: " << eta << endl;
		cerr << "calibration.rate: " << rate << endl;
		cerr << "calibration.samples: " << S << endl;
		cerr << "calibration.candidates: " << num << endl;
		cerr << "calibration.max_trials: " << _l2sgd_opt.calibration_max_trials << endl;

		/* Initialize feature weights as zero. */
		vecset(m_param.c_buf(), 0, K);

		/* Compute the initial loss. */
		m_param.set_scale(1.);
		_l2sgd_calibration_init_loss = 0;
		// One job per sample; each job adds its loss to _l2sgd_calibration_init_loss under _mutex.
		sp_thread_mutex_init( &_mutex, NULL );
		for (i = 0; i < S; ++i) {
			Instance *inst = get_instance(i);
			thread_arg_t *arg_update_one_inst = new thread_arg_t(this, -1, inst, i);
			dispatch_threadpool(_tp, l2sgd_calibration_compute_init_loss, (void *)arg_update_one_inst);
		}
		wait_all_jobs_done(_tp);
		sp_thread_mutex_destroy( &_mutex );

		// NOTE(review): two regularisation terms are added here; the weights were zeroed
		// above, so both are presumably 0 at this point.
		_l2sgd_calibration_init_loss += _l2sgd_opt.c2 * vecdot(m_param.c_buf(), m_param.c_buf(), K);
		_l2sgd_calibration_init_loss += 0.5 * lambda * vecdot(m_param.c_buf(), m_param.c_buf(), K) * N;
		cerr << "Initial loss: " << _l2sgd_calibration_init_loss << endl;


		//init_loss += 0.5 * lambda * vecdot(m_param.c_buf(), m_param.c_buf(), K) * N;
		//cerr << "Initial loss: " << init_loss << endl;

		// Keep going while the increasing phase is active or candidates remain.
		while (num > 0 || !dec) {
			cerr.precision(15);
			cerr << "Trial #" << trials << " (eta = " << eta << ")" << endl;

			/* Perform SGD for one epoch. */
			// t0 = 1 / (lambda * eta) makes the first learning rate inside l2sgd() equal to eta.
			l2sgd(S, 1.0 / (lambda * eta), lambda, 1, true, 1, 0.);

			/* Make sure that the learning rate decreases the log-likelihood. */
			ok = isfinite(_sum_loss) && (_sum_loss < _l2sgd_calibration_init_loss);
			if (ok) {
				cerr << "Loss: " << _sum_loss << endl;
				--num;
			} else {
				cerr << "Loss: " << _sum_loss << " (worse)\n";
			}

			// Remember the eta with the smallest finite loss seen so far.
			if (isfinite(_sum_loss) && _sum_loss < best_loss) {
				best_loss = _sum_loss;
				best_eta = eta;
			}

			if (!dec) {
				if (ok && 0 < num) {
					eta *= rate;
				} else {
					// Switch to the decreasing phase with a fresh candidate budget.
					dec = 1;
					num = _l2sgd_opt.calibration_candidates;
					eta = init_eta / rate;
				}
			} else {
				eta /= rate;
			}

			++trials;
			if (_l2sgd_opt.calibration_max_trials <= trials) {
				break;
			}
		}

		eta = best_eta;

		cerr << "Best learning rate (eta): " << eta; print_time();
		//cerr << "exit here" << endl;
		//exit(-1);
		return 1.0 / (lambda * eta);
	}
	

	void Parser::l2sgd_calibration_compute_init_loss( void *arg )
	{
		Parser *par = ((thread_arg_t *)arg)->_parser;
		Instance *inst = ((thread_arg_t *)arg)->_inst;
		const int inst_idx =  ((thread_arg_t *)arg)->_inst_idx;
		delete ((thread_arg_t *)arg);

		Decoder *decoder = new_decoder();
		par->m_fgen.create_all_feature_vectors(inst);
		par->compute_all_probs(inst);

		// the complete space
		decoder->compute_marginals(inst, false);

		sp_thread_mutex_lock(&_mutex);
		par->_l2sgd_calibration_init_loss += decoder->log_Z(inst);
		sp_thread_mutex_unlock(&_mutex);

		const bool constrained = (!inst->constrained_tags_str.empty());

		if (constrained) {
			// the constrained space
			par->m_fgen.create_constrained_tag_matrix(inst);
			decoder->compute_marginals(inst, true);
			sp_thread_mutex_lock(&_mutex);
			par->_l2sgd_calibration_init_loss -= decoder->log_Z(inst);
			sp_thread_mutex_unlock(&_mutex);
		} else {
			sparsevec sp_fv;
			par->m_fgen.create_all_pos_features_according_to_tree(inst, sp_fv, inst->cpostags);
			const double score = par->m_param.dot(sp_fv);
			sp_thread_mutex_lock(&_mutex);
			par->_l2sgd_calibration_init_loss -= score;
			sp_thread_mutex_unlock(&_mutex);
		}

		//NO_ADD_INIT_LOSS:
		par->m_fgen.dealloc_fvec_prob(inst);
		par-> delete_one_train_instance_after_update_gradient(inst);
		delete_decoder(decoder);
	}


	// Batch training with the CRF++ L-BFGS optimiser.
	//
	// Each iteration: (re)sample the training instances, compute objective and gradient
	// over the batch, add the regularisation term, perform one L-BFGS step, evaluate on
	// the development set and keep only the parameter set with the best "a" accuracy.
	// Runs for at most _iter_num iterations and stops early after 30 consecutive
	// iterations whose relative objective change is below _lbfgs_crfpp_opt.eta.
	void Parser::train_lbfgs_crfpp() {

		CRFPP::LBFGS lbfgs;

		prepare_train_instances();
		// NOTE(review): N is not used in the visible part of this function.
		const int N = get_inst_num_one_iter();
		const int K = m_fgen.feature_dimentionality();

		double old_obj = 1e+37;
		int    converge = 0;	// number of consecutive "converged" iterations

		/* Allocate arrays. */
		m_param.realloc(K);
		double *w = m_param.c_buf();
		_g.resize(K);

		_best_accuracy_a = 0.;
		_best_iter_num_so_far_a = -1;	// -1: nothing saved yet

		// NOTE(review): itr is size_t and is compared with / assigned to int members below.
		for (size_t itr = 0; itr < _iter_num; ++itr) {

			// The instances for iteration 0 were already prepared above.
			if (0 != itr) prepare_train_instances();

			std::fill(_g.begin(), _g.end(), 0);
			_sum_loss = 0;

			objective_and_gradients_batch(this, w, _sum_loss, &_g[0], K);

			// Add the regularisation term and collect statistics. norm and gnorm are sums
			// of squares (no square root is taken) although they are printed as ||w||, ||g||.
			size_t num_nonzero = 0;
			double norm = .0, gnorm = .0;
			for (size_t k = 0; k < K; ++k) {
				if (_lbfgs_crfpp_opt.isL2) {
					_sum_loss += (w[k] * w[k] /(2.0 * _lbfgs_crfpp_opt.c));
					_g[k] += w[k] / _lbfgs_crfpp_opt.c;
				} else {
					// L1: only the objective is adjusted here; the flag is also passed to
					// lbfgs.optimize() below.
					_sum_loss += std::abs(w[k] / _lbfgs_crfpp_opt.c);
				}
				norm += w[k] * w[k];
				gnorm += _g[k] * _g[k];				
				if (w[k] != 0.0) {
					++num_nonzero;
				}
			}

			// Relative change of the objective with respect to the previous iteration.
			double diff = (itr == 0 ? 1.0 :
				std::abs(old_obj - _sum_loss)/old_obj);
			std::cerr << "iter="  << itr
				//	<< " terr=" << 1.0 * thread[0].err / all
				//	<< " serr=" << 1.0 * thread[0].zeroone / x.size()
				<< " ||w||=" << norm
				<< " ||g||=" << gnorm
				<< " non-zero=" << num_nonzero
				<< " obj=" << _sum_loss
				<< " diff="  << diff << std::endl;
			old_obj = _sum_loss;

			if (diff < _lbfgs_crfpp_opt.eta) {
				converge++;
			} else {
				converge = 0;
			}

			//if (itr > _iter_num || converge == 3) {
			// The first condition cannot hold inside this loop (the loop requires
			// itr < _iter_num), so only the convergence counter can trigger the break.
			if (itr > _iter_num || converge == 30) {
				break;  // ad-hoc threshold: 30 consecutive converged iterations (formerly 3)
			}

			// NOTE(review): the return value of optimize() is ignored.
			lbfgs.optimize(K,
				w,
				_sum_loss,
				&_g[0],
				!_lbfgs_crfpp_opt.isL2, _lbfgs_crfpp_opt.c);

			//_mbr_decoding = false;
			//evaluate(m_pipe_dev, false);
			cerr << "*** mbr decoding ***" << endl;
			_mbr_decoding = true;
			evaluate(m_pipe_dev, false);

			// Keep only the parameters of the best iteration: delete the previous best
			// when a better one is found, then save the current ones.
			double this_accuracy = 100.0 * ncorrect_a_max / nword;
			if (this_accuracy > _best_accuracy_a + 1e-5) {
				if (_best_iter_num_so_far_a >= 0) delete_parameters(_best_iter_num_so_far_a);
				_best_iter_num_so_far_a = itr;
				_best_accuracy_a = this_accuracy;
			}
			if (_best_iter_num_so_far_a == itr) save_parameters(itr);
			cerr << "\nbest accuracy (a-sum) so far: " << _best_accuracy_a << " [it = " << _best_iter_num_so_far_a << "]" << endl;
		}
	}

	// Runs one pass over the training instances on the thread pool, queuing a
	// train_update_one_inst_thread job per instance and waiting for all of
	// them to finish.
	//
	// The jobs accumulate into the static _sum_loss and _g; the parameters
	// w, f, g and n are not read inside this function. Exits the process when
	// the parser has no thread pool.
	void Parser::objective_and_gradients_batch( Parser *par, const floatval_t * const w,
		floatval_t &f, floatval_t * const g, const int n )
	{
		if (!par->_tp) {
			cerr << "no thread pool created yet!" << endl;
			exit(-1);
		}

		// Gain read by the worker jobs while they update the gradient.
		_gain = -1.;

		// A former guard that rejected m_pipe_train.use_instances_posi()
		// ("not tested") is intentionally left disabled.

		// The mutex only needs to live for the duration of this pass.
		sp_thread_mutex_init( &_mutex, NULL );

		const int total = get_inst_num_one_iter();
		for (int idx = 0; idx < total; ++idx) {
			Instance *current = get_instance(idx);
			// Ownership of the argument object passes to the job, which deletes it.
			thread_arg_t *job_arg = new thread_arg_t(par, -1, current, idx);
			dispatch_threadpool(par->_tp, train_update_one_inst_thread, static_cast<void *>(job_arg));
		}

		wait_all_jobs_done(par->_tp);
		sp_thread_mutex_destroy( &_mutex );

		cerr << "\ninstance num: " << total << endl;
	}

	// Trains with L2-regularized stochastic gradient descent.
	//
	// Prepares the training instances, sizes the parameter vector, derives
	// lambda from c2 and the instance count, prints the settings, calibrates
	// t0 through l2sgd_calibration() and then runs l2sgd().
	void Parser::train_l2sgd() {

		prepare_train_instances();
		const int N = get_inst_num_one_iter();
		const int K = m_fgen.feature_dimentionality();

		/* Allocate arrays. */
		m_param.realloc(K);

		// NOTE(review): N is used as a divisor without a check; presumably
		// prepare_train_instances() always yields at least one instance --
		// confirm against the caller.
		_l2sgd_opt.lambda = 2. * _l2sgd_opt.c2 / N;

		cerr.precision(5);
		cerr << "Stochastic Gradient Descent (SGD)"; print_time();
		cerr << "batch size: " << _l2sgd_opt.batch_size << endl;
		cerr << "c2: " << _l2sgd_opt.c2 << endl;
		cerr << "max-iterations: " << _iter_num << endl;
		cerr << "period: " << _l2sgd_opt.period << endl;
		cerr << "delta: " << _l2sgd_opt.delta << endl << endl;

		/* Calibrate the training rate (eta). */
		_l2sgd_opt.t0 = l2sgd_calibration();
		cerr << "t0: " << _l2sgd_opt.t0 << endl;

		/* Perform stochastic gradient descent. */
		l2sgd(N, _l2sgd_opt.t0, _l2sgd_opt.lambda, _iter_num, false,
			_l2sgd_opt.period, _l2sgd_opt.delta);
	}

	// Adds one instance's marginal-weighted feature contributions to the
	// sparse gradient g.
	//
	// For every position/tag (unigram) and position/previous-tag/tag (bigram)
	// combination, the step is gain * decoder->marginal_prob(...), applied to
	// whichever of the joint / a / b feature-vector tables of inst are
	// non-empty. Unigram positions run over [1, len); bigram positions run
	// over [1, len], and at position len only Decoder::pos_id_dummy is
	// visited as the current tag.
	void Parser::update_gradient_one_inst( Parser *par, Decoder *decoder, const Instance *inst, sparsevec &g, const double gain )
	{
		const int dim = par->m_fgen.feature_dimentionality();
		const int seq_len = inst->size();
		const int tag_num = Decoder::T;

		// Which feature tables exist does not change while we iterate.
		const bool has_uni_joint = !inst->fvec_unigram_joint.empty();
		const bool has_uni_a = !inst->fvec_unigram_a.empty();
		const bool has_uni_b = !inst->fvec_unigram_b.empty();
		const bool has_bi_joint = !inst->fvec_bigram_joint.empty();
		const bool has_bi_a = !inst->fvec_bigram_a.empty();
		const bool has_bi_b = !inst->fvec_bigram_b.empty();

		// Unigram part.
		for (int pos = 1; pos < seq_len; ++pos) {
			for (int cur = 0; cur < tag_num; ++cur) {
				const double delta = gain * decoder->marginal_prob(inst, pos, cur);
				if (has_uni_joint) update_gradient(g, inst->fvec_unigram_joint[pos][cur], delta, dim);
				if (has_uni_a) update_gradient(g, inst->fvec_unigram_a[pos][ joint_id_2_a[cur] ], delta, dim);
				if (has_uni_b) update_gradient(g, inst->fvec_unigram_b[pos][ joint_id_2_b[cur] ], delta, dim);
			}
		}

		// Bigram part (includes the extra position seq_len).
		for (int pos = 1; pos <= seq_len; ++pos) {
			const bool past_last = (pos == seq_len);
			for (int cur = 0; cur < tag_num; ++cur) {
				if (past_last && cur != Decoder::pos_id_dummy) continue;
				for (int prev = 0; prev < tag_num; ++prev) {
					const double delta = gain * decoder->marginal_prob(inst, pos, prev, cur);
					if (has_bi_joint) update_gradient(g, inst->fvec_bigram_joint[pos][prev][cur], delta, dim);
					if (has_bi_a) update_gradient(g, inst->fvec_bigram_a[pos][ joint_id_2_a[prev] ][ joint_id_2_a[cur] ], delta, dim);
					if (has_bi_b) update_gradient(g, inst->fvec_bigram_b[pos][ joint_id_2_b[prev] ][ joint_id_2_b[cur] ], delta, dim);
				}
			}
		}
	}

	// Adds the contribution of one feature vector to the sparse gradient g.
	//
	// Each feature id is fv.offset + fv.idx[i]; its entry in g is increased by
	// fv.val[i] * marg, or by marg alone when fv has no value array. Nothing
	// is done when marg compares equal to zero. n is the exclusive upper bound
	// on feature ids and is only used by the assertion.
	void Parser::update_gradient( sparsevec &g, const fvec &fv, const double marg, const int n )
	{
		assert(fv.n >= 0);
		if (equal_to(marg, 0)) return;
		for (int k = 0; k < fv.n; ++k) {
			const int feat_id = fv.offset + fv.idx[k];
			assert(feat_id >= 0 && feat_id < n);

			double delta = marg;
			if (fv.val) delta = fv.val[k] * marg;

			// A single lookup per feature: operator[] creates a missing entry
			// value-initialized to 0.0 before the addition, which matches the
			// former find / assign-zero / add sequence.
			// NOTE(review): assumes sparsevec is a std::map-style container
			// (its use via find()/first/second suggests so) -- confirm.
			g[feat_id] += delta;
		}
	}

	// Dense-array counterpart of the sparse overload: adds one instance's
	// marginal-weighted feature contributions to the gradient buffer g.
	//
	// The step for each (position, tag) and (position, previous tag, tag)
	// combination is gain * decoder->marginal_prob(...), applied to every
	// non-empty joint / a / b feature-vector table of inst. Unigram positions
	// cover [1, len); bigram positions cover [1, len], where position len is
	// restricted to the current tag Decoder::pos_id_dummy.
	void Parser::update_gradient_one_inst( Parser *par, Decoder *decoder, const Instance *inst, double *g, const double gain )
	{
		const int feat_dim = par->m_fgen.feature_dimentionality();
		const int n_pos = inst->size();
		const int n_tag = Decoder::T;

		// Table availability is fixed for the whole call, so test it once.
		const bool use_uni_joint = !inst->fvec_unigram_joint.empty();
		const bool use_uni_a = !inst->fvec_unigram_a.empty();
		const bool use_uni_b = !inst->fvec_unigram_b.empty();

		for (int p = 1; p < n_pos; ++p) {
			for (int tag = 0; tag < n_tag; ++tag) {
				const double amount = gain * decoder->marginal_prob(inst, p, tag);
				if (use_uni_joint) update_gradient(g, inst->fvec_unigram_joint[p][tag], amount, feat_dim);
				if (use_uni_a) update_gradient(g, inst->fvec_unigram_a[p][ joint_id_2_a[tag] ], amount, feat_dim);
				if (use_uni_b) update_gradient(g, inst->fvec_unigram_b[p][ joint_id_2_b[tag] ], amount, feat_dim);
			}
		}

		const bool use_bi_joint = !inst->fvec_bigram_joint.empty();
		const bool use_bi_a = !inst->fvec_bigram_a.empty();
		const bool use_bi_b = !inst->fvec_bigram_b.empty();

		for (int p = 1; p <= n_pos; ++p) {
			for (int tag = 0; tag < n_tag; ++tag) {
				// At the position one past the sequence only the dummy tag is visited.
				if (p == n_pos && tag != Decoder::pos_id_dummy) continue;
				for (int left = 0; left < n_tag; ++left) {
					const double amount = gain * decoder->marginal_prob(inst, p, left, tag);
					if (use_bi_joint) update_gradient(g, inst->fvec_bigram_joint[p][left][tag], amount, feat_dim);
					if (use_bi_a) update_gradient(g, inst->fvec_bigram_a[p][ joint_id_2_a[left] ][ joint_id_2_a[tag] ], amount, feat_dim);
					if (use_bi_b) update_gradient(g, inst->fvec_bigram_b[p][ joint_id_2_b[left] ][ joint_id_2_b[tag] ], amount, feat_dim);
				}
			}
		}
	}

	// Adds the contribution of one feature vector to the dense gradient g.
	//
	// For each feature i of fv, g[fv.offset + fv.idx[i]] grows by
	// fv.val[i] * marg, or by marg when fv carries no value array. Returns
	// immediately when marg compares equal to zero. n bounds the feature ids
	// and is only checked by the assertion.
	void Parser::update_gradient( floatval_t *g, const fvec &fv, const double marg, const int n )
	{
		assert(fv.n >= 0);
		if (equal_to(marg, 0)) return;

		for (int k = 0; k < fv.n; ++k) {
			const int slot = fv.offset + fv.idx[k];
			assert(slot >= 0 && slot < n);
			if (fv.val) {
				g[slot] += fv.val[k] * marg;
			} else {
				g[slot] += marg;
			}
		}
	}

	void Parser::parse_one_inst_thread( void *arg )
	{
		Parser *par = ((thread_arg_t *)arg)->_parser;
		Instance *inst = ((thread_arg_t *)arg)->_inst;
		bool is_test = ((thread_arg_t *)arg)->_is_test;
		delete ((thread_arg_t *)arg);

		Decoder *decoder = new_decoder();
		par->parse(decoder, inst, is_test);
		delete_decoder(decoder);
	}

	void Parser::train_update_one_inst_thread( void *arg )
	{
		Parser *par = ((thread_arg_t *)arg)->_parser;
		Instance *inst = ((thread_arg_t *)arg)->_inst;
		const int inst_idx =  ((thread_arg_t *)arg)->_inst_idx;
		delete ((thread_arg_t *)arg);

		const int K = par->m_fgen.feature_dimentionality();

		Decoder *decoder = new_decoder();
		par->m_fgen.create_all_feature_vectors(inst);
		par->compute_all_probs(inst);

		double *g = &_g[0];
		sparsevec sparse_g;

		// whole space
		decoder->compute_marginals(inst, false);	// can not use par->m_decoder
		double loss = decoder->log_Z(inst);
			
		if (_gradient_update_allow_conflict) {
			update_gradient_one_inst(par, decoder, inst, g, -_gain);
		} else {
			update_gradient_one_inst(par, decoder, inst, sparse_g, -_gain);
		}

		const bool constrained = (! inst->constrained_tags_str.empty());

		// constrained space
		if (constrained) {
			par->m_fgen.create_constrained_tag_matrix(inst);
			decoder->compute_marginals(inst, true);
			if (_gradient_update_allow_conflict) {
				update_gradient_one_inst(par, decoder, inst, g, _gain);
			} else {
				update_gradient_one_inst(par, decoder, inst, sparse_g, _gain);
			}
			loss -= decoder->log_Z(inst);
		} else {
			const double score_gold = (
					_gradient_update_allow_conflict ? 
					par->update_weights_or_gradients_with_gold_tree(inst, g, _gain) :
					par->update_weights_or_gradients_with_gold_tree(inst, sparse_g, _gain) );
			loss -= score_gold;
		}
		
		par->m_fgen.dealloc_fvec_prob(inst);
		par->delete_one_train_instance_after_update_gradient(inst);
		delete_decoder(decoder);

		sp_thread_mutex_lock(&_mutex);
		_sum_loss += loss;
		if (!_gradient_update_allow_conflict) {
			sparsevec::const_iterator V_i = sparse_g.begin();
			const sparsevec::const_iterator V_end = sparse_g.end();
			for(; V_i != V_end; ++V_i) {
				const int id = V_i->first;
				const double val = V_i->second;
				assert(id < K && id >= 0);
				g[id] += val;
			}
		}
		sp_thread_mutex_unlock(&_mutex);
		if (inst_idx % par->_display_interval == 0) cerr << inst_idx << " ";
		if (inst_idx % (par->_display_interval * 10) == 0) print_time();
	}


} // namespace dparser


