#include #include #include #include #include #include #include #include #include #include #include namespace po = boost::program_options; using namespace std; using namespace boost::unordered; using namespace boost::bimaps; // raw row as read from input file typedef vector stringrow; // row with anonymized integer alphabet typedef vector row; // rowtype structure, containing protoype row and number of occurences struct rowtype { row prototype; int number; }; // vector of row types typedef vector V_rowtypes; // 2vector of possible rows typedef vector< vector > VV_rows; // pattern vector type as vector of boolean, true mean to keep symbol typedef vector patternvector; // hashset of pattern vectors typedef unordered_set S_patternvectors; // vector of pattern vectors typedef vector V_patternvectors; // costs and bounds vectors typedef vector V_costs; typedef vector V_numbers; // mapping rows to the correct index of the corresponding row type typedef unordered_map M_row_index; // mapping output types to compatible rows typedef unordered_set S_indices; typedef vector< vector < S_indices > > VV_S_indices; typedef unordered_map M_outputtype_index; typedef vector V_M_outputtype_index; // // data structure storing compatiblity of rows, pattern vectors and output types // struct compatiblity_triple // { // int rowtype_id; // int patternvector_id; // int outputtype_id; // }; // typedef unordered_set S_compatiblity; // std::size_t hash_value ( compatiblity_triple const& t ) // { // boost::hash hasher; // return hasher ( t.rowtype_id ) xor hasher ( t.patternvector_id ) xor hasher ( t.outputtype_id ); // }; // bool operator== ( compatiblity_triple const& a, compatiblity_triple const& b ) // { // return ( a.rowtype_id == b.rowtype_id ) && ( a.patternvector_id == b.patternvector_id ) && ( a.outputtype_id == b.outputtype_id ); // } // bi-mapping symbols to integers struct strsymbol {}; struct intsymbol {}; typedef bimap < tagged, tagged > M_alphabet; typedef M_alphabet::value_type symbol_mapping; typedef M_alphabet::left_iterator strsymboliterator; typedef M_alphabet::right_iterator intsymboliterator; // solution mapping struct index_mapping { int x; int y; }; typedef unordered_map M_rowtype_patternvector; std::size_t hash_value ( index_mapping const& t ) { boost::hash hasher; return hasher ( t.x ) xor hasher ( t.y ); }; bool operator== ( index_mapping const& a, index_mapping const& b ) { return ( a.x == b.x ) && ( a.y == b.y ); } // storing invalid mapping from row to pv typedef unordered_set< index_mapping > S_indexpairs; typedef boost::tokenizer< boost::escaped_list_separator > CSVsep; typedef vector Ranking; typedef vector Rankings; typedef IloArray IntVarMatrix; typedef IloArray Bool2Array; typedef IloArray Bool3Array; bool computeSolution ( VV_S_indices& C, S_indexpairs& invalidIJ, S_indexpairs& invalidJL, V_costs& w, int n, V_numbers& ni, V_numbers& pj, int m, int k, M_rowtype_patternvector& xIJmap, int& totalcosts, bool outlier_detection, int outlier_vector_id ) { bool success = false; IloEnv env; try { // define constants cout << " Defining constants...\n"; IloInt n_ILP = n; IloInt m_ILP = m; IloInt k_ILP = k; // cout <<"\n n="<< n // <<" m="<< m // <<" k="<< k // <<" ni.size="<< ni.size() // <<" pj.size="<< pj.size() // <<" C.size="<< C.size() // <<" w.size"<< w.size() <<"\n"; IloIntArray ni_ILP ( env, ni.size() ); for ( IloInt i = 0; i < ni.size(); i++ ) { ni_ILP[i] = ni.at ( i ); } IloIntArray w_ILP ( env, w.size() ); for ( IloInt j = 0; j < w.size(); j++ ) { w_ILP[j] = w.at ( j ); } // define variables cout << " Defining variables...\n"; IntVarMatrix xIJ_ILP ( env, ni.size() ); for ( IloInt i = 0; i < ni.size(); i++ ) { xIJ_ILP[i] = IloIntVarArray ( env, pj.size(), 0, IloInfinity ); } IntVarMatrix uJL_ILP ( env, w.size() ); for ( IloInt j = 0; j < pj.size(); j++ ) { uJL_ILP[j] = IloIntVarArray ( env, pj.at ( j ), 0, IloInfinity ); } IloExpr profit ( env ); IloModel model ( env ); // adding constraints cout << " Adding constraints...\n"; IloArray expr_sumMappedToJL ( env, pj.size() ); for ( IloInt j = 0; j < pj.size(); j++ ) { expr_sumMappedToJL[j] = IloExprArray ( env, pj.at ( j ) ); for ( IloInt l = 0; l < pj.at ( j ); l++ ) { if ( ( outlier_detection ) && ( j == outlier_vector_id ) ) { // No constraints for output type size needed. } else { index_mapping JL; JL.x=j; JL.y=l; if ( invalidJL.find ( JL ) == invalidJL.end() ) { // output type l of pattern vector j may be used expr_sumMappedToJL[j][l] = IloExpr ( env ); for ( S_indices::iterator i=C[j][l].begin(); i!=C[j][l].end(); i++ ) { expr_sumMappedToJL[j][l] += xIJ_ILP[ ( *i )][j]; } // constraint (1) model.add ( expr_sumMappedToJL[j][l] <= n_ILP - uJL_ILP[j][l] * n_ILP ); // constraint (2) model.add ( expr_sumMappedToJL[j][l] + k_ILP * uJL_ILP[j][l] >= k_ILP ); } else { // output type l of pattern vector j will never be used model.add ( uJL_ILP[j][l] == 0 ); } } } } // constraint (3) for ( IloInt i = 0; i < ni.size(); i++ ) { model.add ( IloSum ( xIJ_ILP[i] ) == ni_ILP[i] ); } // rows from type i will never be mapped to pattern vector j for ( S_indexpairs::iterator ij=invalidIJ.begin(); ij!=invalidIJ.end(); ij++ ) { index_mapping const& invalidIJ= ( *ij ); model.add ( xIJ_ILP[invalidIJ.x][invalidIJ.y] == 0 ); } // building up objective function (constraint (4)) cout << " Defining objective function...\n"; for ( IloInt i = 0; i < ni.size(); i++ ) { for ( IloInt j = 0; j < pj.size(); j++ ) { profit += xIJ_ILP[i][j] * w_ILP[j]; } } model.add ( IloMinimize ( env, profit ) ); cout << " Solving ILP...\n"; IloCplex cplex ( model ); cplex.setOut ( env.getNullStream() ); success = cplex.solve(); if ( success ) { // cout << " Objective value: " << cplex.getObjValue() << "\n"; // cout << " Variable values: \n"; // for(IloInt i=0; i 0 ) { index_mapping ij; ij.y = j; ij.x = i; xIJmap.insert ( M_rowtype_patternvector::value_type ( ij, mappingcount ) ); } } } } } catch ( IloException& ex ) { cerr << "Error: " << ex << endl; } catch ( ... ) { cerr << "Error" << endl; } env.end(); return success; } void printPatternVector ( patternvector& p, ostream& o ) { o << "("; for ( patternvector::iterator i = p.begin(); i != p.end(); i++ ) { if ( i != p.begin() ) { o << ","; } if ( *i ) { o << "_"; } else { o << "*"; } } o << ")\n"; } void printPatternVectors ( V_patternvectors& P, ostream& o ) { for ( V_patternvectors::iterator i = P.begin(); i != P.end(); i++ ) { patternvector currentPV = *i; printPatternVector ( currentPV, o ); } } void printPatternVector ( patternvector& p, int cost, ostream& o ) { o << "("; for ( patternvector::iterator i = p.begin(); i != p.end(); i++ ) { if ( i != p.begin() ) { o << ","; } if ( *i ) { o << "_"; } else { o << "*"; } } o << ") - costs=" << cost << "\n"; } void printPatternVectors ( V_patternvectors& P, V_costs& w, ostream& o ) { for ( int i = 0; i < P.size(); i++ ) { printPatternVector ( P.at ( i ), w.at ( i ), o ); } } int generate_patternvectors ( V_patternvectors& P, V_costs& w, int m ) { for ( int i=0; i< ( 1< 0 ) && ( Pset.find ( currentPV ) == Pset.end() ) ) { Pset.insert ( currentPV ); //cout << "\n Inserted "; //printPatternVector ( currentPV, cout ); P.push_back ( currentPV ); w.push_back ( currentCost ); if ( is_outlier_vector ) { outlier_vector_id=vector_id; } vector_id++; } } } } void printRowType ( rowtype& rowtype, M_alphabet& Sigma, ostream& o ) { o << rowtype.number << "x: "; for ( int j = 0; j < rowtype.prototype.size(); j++ ) { if ( j > 0 ) o << ","; int& symbol = rowtype.prototype[j]; intsymboliterator symbolInSigma = Sigma.by().find ( symbol ); if ( symbolInSigma != Sigma.by().end() ) { o << ( *symbolInSigma ).second; } else { cerr << "Symbol " << symbol << "not found.\n"; } } o << "\n"; } void printRowTypes ( V_rowtypes& T, M_alphabet& Sigma, ostream& o ) { for ( int i = 0; i < T.size(); i++ ) { printRowType ( T[i], Sigma, o ); } } void printRowType ( rowtype& rowtype, ostream& o ) { o << rowtype.number << "x: "; for ( int j = 0; j < rowtype.prototype.size(); j++ ) { if ( j > 0 ) o << ","; o << rowtype.prototype[j]; } o << "\n"; } void printRowTypes ( V_rowtypes& T, ostream& o ) { for ( int i = 0; i < T.size(); i++ ) { printRowType ( T[i], o ); } } int readRowTypes ( V_rowtypes& T, M_alphabet& Sigma, int& n, V_numbers& ni, int& m, string filename ) { ifstream F ( filename.data() ); if ( !F.is_open() ) { cerr << "Could not open input file: " << filename << "\n"; return -1; } else { M_row_index RowTypeIndices; while ( !F.eof() ) { string line; row currentRow; getline ( F, line ); CSVsep sep ( line ); for ( CSVsep::iterator i = sep.begin(); i != sep.end(); i++ ) { int symbolnumber; string symbol = *i; strsymboliterator currentStrSymbolInSigma = Sigma.by().find ( symbol ); if ( currentStrSymbolInSigma != Sigma.by().end() ) { symbolnumber = ( *currentStrSymbolInSigma ).second; } else { symbolnumber = Sigma.by().size(); Sigma.insert ( symbol_mapping ( symbol, symbolnumber ) ); } currentRow.push_back ( symbolnumber ); } M_row_index::iterator currentRowTypeNumber = RowTypeIndices.find ( currentRow ); if ( currentRow.size() ) { if ( currentRowTypeNumber == RowTypeIndices.end() ) { rowtype newRowType; newRowType.number = 1; newRowType.prototype = currentRow; RowTypeIndices[currentRow] = T.size(); T.push_back ( newRowType ); } else { T[ ( *currentRowTypeNumber ).second].number++; } } } } n = 0; m = 0; if ( T.size() > 0 ) { for ( int i = 0; i < T.size(); i++ ) { int current_ni = T.at ( i ).number; ni.push_back ( current_ni ); n = n + current_ni; } m = T.at ( 0 ).prototype.size(); } } void print_outputtype ( row const& outputrow, M_alphabet& Sigma, ostream& o ) { for ( int x = 0; x < outputrow.size(); x++ ) { if ( x > 0 ) o << ","; if ( outputrow.at ( x ) == -1 ) { o << "*"; } else { int symbol = outputrow.at ( x ); intsymboliterator symbolInSigma = Sigma.by().find ( symbol ); if ( symbolInSigma != Sigma.by().end() ) { o << ( *symbolInSigma ).second; } else { cerr << "Symbol " << symbol << "not found.\n"; } } } } void print_outputtype ( row const& outputrow, ostream& o ) { for ( int x = 0; x < outputrow.size(); x++ ) { if ( x > 0 ) o << ","; if ( outputrow.at ( x ) == -1 ) { o << "*"; } else { o << outputrow[x]; } } } void print_outputtype ( row const& outputrow, S_indices compatibleRow_ids, ostream& o ) { o << "("; print_outputtype ( outputrow, o ); o << ")"; o << " - compatible for row types" << ": "; for ( S_indices::iterator x = compatibleRow_ids.begin(); x != compatibleRow_ids.end(); x++ ) { if ( x != compatibleRow_ids.begin() ) o << ","; o << ( *x ); } o << "\n"; } void print_outputtypes ( VV_rows& Tout_possJL, S_indexpairs& InvalidJL, VV_S_indices& C, ostream& o, int outlier_vector_id ) { for ( int j = 0; j < C.size(); j++ ) { for ( int l = 0; l < C[j].size(); l++ ) { index_mapping JL; JL.x=j; JL.y=l; if ( InvalidJL.find ( JL ) == InvalidJL.end() ) { row const& current_prototype=Tout_possJL[j][l]; if ( j != outlier_vector_id ) { print_outputtype ( current_prototype, C[j][l], o ); } else { o << "("; print_outputtype ( current_prototype, o ); o << ") - OUTLIER VECTOR, compatible with all rows\n"; } } // else // { // row const& current_prototype=Tout_possJL[j][l]; // o << "("; // print_outputtype ( current_prototype, o ); // o << ") - NOT POSSIBLE, can't get k rows\n"; // } } } } void compute_outputtypes ( V_rowtypes& T, V_patternvectors& P, VV_S_indices& C, S_indexpairs& InvalidMappings, S_indexpairs& InvalidConstraints, V_numbers& pi, int k, VV_rows& Tout_possJL ) { for ( int j = 0; j < P.size(); j++ ) { patternvector& pvJ = P.at ( j ); // storing indices of already existing output types M_outputtype_index outputtype_indices; int num_outputtypes_pvJ = 0; vector C_; vector Tout_possJ; for ( int i = 0; i < T.size(); i++ ) { //cout << "\n R" << i << "/" << T.size(); row& rowtypeI = T.at ( i ).prototype; // compute compatible PVinstance for pattern vector i and row tzpe jj row prototype; vectorc__; for ( int x = 0; x < rowtypeI.size(); x++ ) { if ( pvJ.at ( x ) ) { prototype.push_back ( rowtypeI.at ( x ) ); } else { prototype.push_back ( -1 ); } } // search for PVinstance in map M_outputtype_index::iterator compatible_outputtype_id = outputtype_indices.find ( prototype ); if ( compatible_outputtype_id == outputtype_indices.end() ) { // this is a new output type Tout_possJ.push_back ( prototype ); outputtype_indices.insert ( M_outputtype_index::value_type ( prototype, num_outputtypes_pvJ++ ) ); S_indices compRowIndices; compRowIndices.insert ( i ); C_.push_back ( compRowIndices ); } else { // output type already exists, thus add row type index S_indices& comp = C_[ ( *compatible_outputtype_id ).second]; comp.insert ( i ); } } Tout_possJL.push_back ( Tout_possJ ); // Important: Mark all mapping from row type i to pattern vector j as invalid, if the maximum number of mapped row to the // corresponding output type cannot exceed k for ( int l=0; l < num_outputtypes_pvJ; l++ ) { S_indices& comp=C_[l]; int maxMappedRows=0; for ( S_indices::iterator cIt=comp.begin(); cIt!=comp.end(); cIt++ ) { maxMappedRows += T[*cIt].number; } if ( maxMappedRowssecond; avg_rowtypesize+=currentRowTypeSize; if ( max_rowtypesize ( &k )->default_value ( 5 ), "degree k of anonymity (default=5)" ) ( "pattern-vectors,p", po::value()->composing(), "pattern vectors file" ) ( "outlier-detection,o", "also allow few completely suppressed rows" ) ; po::options_description hidden ( "Hidden options" ); hidden.add_options() ( "input-file", po::value(), "input file" ) ; po::options_description cmdline_options; cmdline_options.add ( options ).add ( hidden ); po::positional_options_description p; p.add ( "input-file", -1 ); po::variables_map vm; store ( po::command_line_parser ( ac, av ). options ( cmdline_options ).positional ( p ).run(), vm ); notify ( vm ); if ( ( vm.count ( "help" ) ) || ( (!vm.count ( "input-file" )) && (!vm.count ( "version" )) ) ) { cout << "Usage: patternILP [options] input-file\n"; cout << options << "\n"; return 0; } if ( vm.count ( "version" ) ) { cout << "Pattern Guided k-Anonymity solver by ILP, version 0.2.1\n"; return 0; } if ( vm.count ( "outlier-detection" ) ) { cout << "Outerlier detection enabled: Allow to completely suppress (even less than k) rows.\n"; outlier_detection=true; } if ( vm.count ( "pattern-vectors" ) ) { patternvectorFilename = vm["pattern-vectors"].as(); cout << "pattern vectors file: " << patternvectorFilename << "\n"; } if ( vm.count ( "input-file" ) ) { inputFilename = vm["input-file"].as(); cout << "input file: " << inputFilename << "\n"; } cout << "degree k of anonymity: " << k << "\n"; } catch ( exception& e ) { cout << e.what() << "\n"; return 1; } double timeoffset = 0.000; timeval start, end; double cpu_time_used; gettimeofday ( &start, 0 ); double time1=0.0, tstart; // time measurment variables tstart = clock(); // start if ( inputFilename.size() ) { readRowTypes ( T, Sigma, n, ni, m, inputFilename ); } else { cerr << "ERROR: No input file specified!\n"; return 2; } // Read pattern vectors file if ( patternvectorFilename.size() ) { readPatternVectors ( P, w, patternvectorFilename, outlier_vector_id ); } // Generate all pattern vectors else { generate_patternvectors ( P, w, m ); } if ( T.size() < 1 ) { cerr << "ERROR: Data matrix is empty!\n"; return 3; } if ( P.size() < 1 ) { cerr << "ERROR: No pattern vectors specified!\n"; return 3; } if ( P[0].size() != T[0].prototype.size() ) { cerr << "ERROR: Pattern vectors and data matrix are not compatible!\n"; return 3; } cout << "\n Printing original row types:\n"; printRowTypes ( T,Sigma,cout ); cout << "\n Printing row types (simplified):\n"; printRowTypes ( T,cout ); cout << "\n Printing pattern vectors:\n"; printPatternVectors ( P, w, cout ); cout << "\n Computing possible output types.\n"; compute_outputtypes ( T, P, C, InvalidMappings, InvalidConstraints, pi, k, Tout_possJL ); cout << "\n Printing possible output types:\n"; print_outputtypes ( Tout_possJL, InvalidConstraints, C, cout, outlier_vector_id ); cout << "\n Generating ILP... \n"; bool exit_with_errorcode=false; if ( computeSolution ( C, InvalidMappings, InvalidConstraints, w, n, ni, pi, m, k, xIJmap, totalcosts, outlier_detection, outlier_vector_id ) ) { cout << "\n An optimal solution was found:\n"; compute_outputblocks ( xIJmap, T, P, Tout ); cout << "\n Anonymized data (simplified):\n"; print_solution ( Tout, cout ); cout << "\n Anonymized data (raw):\n"; print_solution ( Tout, Sigma,cout ); cout << "\n Total costs: " << totalcosts << "\n"; compute_quality_measures ( Tout,number_rowtypes,avg_rowtypesize,max_rowtypesize,sum_squared_rowtypesizes ); cout << " Number of output row types: " << number_rowtypes << "\n"; cout << " Average row type size: " << avg_rowtypesize << "\n"; cout << " Maximum row type size: " << max_rowtypesize << "\n"; cout << " Sum of squared row type sizes: " << sum_squared_rowtypesizes << "\n"; } else { cout << "\n No solution was found.\n"; exit_with_errorcode=true; } time1 += clock() - tstart; // end time1 = time1/CLOCKS_PER_SEC; // rescale to seconds gettimeofday ( &end, 0 ); cpu_time_used = end.tv_sec - start.tv_sec + ( end.tv_usec - start.tv_usec ) / 1000000.; cout << " Running time = " << ( cpu_time_used + timeoffset ) << " sec." << endl; cout << " CPU-time (single core) = " << time1 << " sec." << endl; if ( exit_with_errorcode ) { return -1; } else { return 0; } }