//-------------------------------------------------- // aftermulti.c - version 1.00 22.4.98 // (c) Eugene Prosso 1998 // The program corrects the first-order excess/lack of noise/signal in SRL // spherse, caused by the difference in "true" and "seen" clearity. // ------------------------------------------------ //-------------------------------------------------- // input file format - as in mutli.c //-------------------------------------------------- //-------------------------------------------------- // first one must run SRL on MC itself. Then the obtained // column with probabilities must be added to MC file // as the LAST column. Then run afterSRL. //-------------------------------------------------- #include "skiplist.h" #include "afterSRL.h" #include #include void ReadInput ( char* InputFileName, char *DataName, char* MCName, char *OutName, int &nnns, int nns[], int &flag, int &idt ) ; void CheckInput ( char* DataName, char* MCName, int &ndata, int &nMC, int &d ) ; void ReadArrays ( char* DataName, char* MCName, double** &Data, double** &MC, int ndata, int nMC, int d ); void Forker ( double **Data, double **MC, char *OutName, int ndata, int nMC, int d, int nnns, int nns[], int flag, int idt ); void FindNeighbours ( double** Data, double** MC, char * OutName, int ndata1, int ndata2, int nMC, int d, int nnns, int nns[], int flag, int SoughtIdt ) ; inline double EucliDistance ( double *vect1, double *vect2, int d ) ; inline void CalcProbability ( LinkedList &list, int SoughtIdt, int nnns, int nns[], int flag, double probability[] ) ; void panic (char* str) ; inline INT (double val) { return val < 0 ? (int) (val - 0.5) : int (val + 0.5) ; } // ---------------------- // MAIN // ---------------------- int main (int argc, char* argv[]) { int idt; /* idt being sought */ int d; /* number of variables */ int ndata, nMC ; /* number of lines in data and MC */ char* DataName = new char[30]; // data file name */ char* MCName = new char[30]; // MC file name */ char* OutName = new char[30]; // output file name */ double **Data, **MC; int nns[20]; // number of neighbours to consider int nnns; // total number of different nns int flag; if (argc < 2) panic ("neighbours ") ; ReadInput ( argv[1], // reads command-line arguments DataName, MCName, OutName, nnns, nns, flag, idt ); CheckInput ( DataName, MCName, // half-checks compatibility ndata, nMC, d ); // of data & MC files ReadArrays ( DataName, MCName, Data, MC, ndata, nMC, d ); Forker ( Data, MC, OutName, ndata, nMC, d, nnns, nns, flag, idt) ; return 1; } // forks the process into m, each void Forker (double **Data, double **MC, char *OutName, int ndata, int nMC, int d, int nnns, int nns[], int flag, int idt ) { char *command = new char[200]; char *OutName1 = new char[200]; if (flag == 1 || flag == 0) { FindNeighbours ( Data, MC, OutName, // finds nns[i] nearest 0, ndata, nMC, d, // MCnbrs for each data pt., nnns, nns, flag, idt ); // prints results in OF[i] return; } pid_t pid; int ndata1, ndata2; int m = 0; int status; int waited = 0; for (int counter = 1 ; counter <= flag ; counter++) { if ((pid = fork()) < 0) panic ("Unable to fork. Exit"); if (pid == 0) { ndata1 = (ndata * (counter-1)) / flag ; ndata2 = (ndata * counter) / flag ; sprintf (OutName1, "%s%d", OutName, m+1); FindNeighbours ( Data, MC, OutName1, ndata1, ndata2, nMC, d, nnns, nns, flag, idt ); exit (0); } m++ ; if (m == 2) { m = 0 ; for (int cnnt = 0; cnnt <2 ; cnnt++) wait (&status); waited += 2; for ( int cnt1 = 0 ; cnt1 < nnns ; cnt1++ ) { sprintf (command, "cat ") ; for ( int cnt2 = 1; cnt2 <= 2 ; cnt2++ ) sprintf (command, "%s%s%d.%d ", command, OutName, cnt2, nns[cnt1] ); sprintf (command, "%s >> %s.%d", command, OutName, nns[cnt1]); system (command); } } } for (counter = 0 ; counter < flag-waited ; counter++ ) if (pid > 0) wait (&status); for ( int cnt1 = 0 ; cnt1 < nnns ; cnt1++ ) { sprintf (command, "cat ") ; for ( int cnt2 = 1; cnt2 <= flag-waited ; cnt2++ ) sprintf (command, "%s%s%d.%d ", command, OutName, cnt2, nns[cnt1]); sprintf (command, "%s >> %s.%d", command, OutName, nns[cnt1]); system (command); } for ( cnt1 = 1 ; cnt1 < 3 ; cnt1++ ) { sprintf (command, "rm %s%d*", OutName, cnt1); system (command); } } /* Transforms program arguments into corresponding variables */ void ReadInput ( char *FileName, char *DataName, char* MCName, char *OutName, int &nnns, int nns[], int &flag, int &idt ) { ifstream InFile ; InFile.open (FileName, ios::nocreate); if (InFile.fail () ) panic ("No input file" ); InFile >> DataName ; InFile >> MCName; InFile >> OutName; InFile >> idt ; InFile >> flag ; // alfa = sqrt(noise)/signal OR signal/total InFile >> nnns; for (int cnt1 = 0; cnt1 < nnns; cnt1++ ) InFile >> nns[cnt1]; InFile.close(); } /* 1. compares number of columns in MC and data files. (#column(MC) - #coulmn(data) should be 1 ) 2. counts number of variables and number of points in data and MC */ void CheckInput ( char* DataName, char* MCName, int &ndata, int &nMC, int &d ) { int nwords, nlines; double DataNcolumns, MCNcolumns; char *com = (char*) malloc (512); // command ifstream dummy; int cnt ; /* counter */ for (cnt = 0 ; cnt++ <= 1; ) { if (cnt == 1) sprintf (com, "wc %s > dummyfile", DataName); else sprintf (com, "wc %s > dummyfile", MCName); system (com); dummy.open ("dummyfile", ios::nocreate); dummy >> nlines >> nwords; if (cnt == 1) { DataNcolumns = (double) nwords / nlines ; ndata = nlines ; } else { MCNcolumns = (double) nwords / nlines ; nMC = nlines ; } dummy.close(); } if ( fabs ( DataNcolumns - INT (DataNcolumns) > 0.0001 ) || fabs ( MCNcolumns - INT (MCNcolumns + 0.5) > 0.0001 ) || INT (MCNcolumns) - INT (DataNcolumns) != 2 ) panic ("Bad input files or unmatched number of variables\n") ; d = INT (DataNcolumns) ; system ("rm dummyfile"); } // ------------------------------------------------------------------------ // in the simplest and most naive version of this program // this function just reads the data and MC files // INTO RAM!!! // ------------------------------------------------------------------------ void ReadArrays ( char* DataName, char* MCName, double** &Data, double** &MC, int ndata, int nMC, int d ) { ifstream DataFile, MCFile; long cnt1; DataFile.open (DataName, ios::nocreate); MCFile.open (MCName, ios::nocreate); Data = new double* [ndata]; MC = new double* [nMC]; // reading from data file to data array for ( cnt1 = 0 ; cnt1 < ndata; cnt1++ ) { Data[cnt1] = new double [d]; for (int cnt2 = 0; cnt2 < d; cnt2++ ) { DataFile >> Data[cnt1][cnt2]; if ( DataFile.eof() ) panic ( "Unexpected data EOF in FindNeighbours" ); } } // reading from MC file to MC array for ( cnt1 = 0 ; cnt1 < nMC; cnt1++ ) { MC[cnt1] = new double [d+2]; for (int cnt2 = 0; cnt2 <= d+1; cnt2++ ) { MCFile >> MC[cnt1][cnt2]; if (MCFile.eof()) panic( "Unexpected MC EOF in FindNeighbours\n"); } } DataFile.close (); MCFile.close (); } // ------------------------------------------------------------------------ // The major procedure - finding nn-s for each point & // printing the results. // Complexity = O (D * M * d) // D - # of data points, M - # of MC points, d - dimensionality // ------------------------------------------------------------------------ void FindNeighbours ( double **Data, double **MC, char *OutName, int ndata1, int ndata2, int nMC, int d, int nnns, int nns[], int flag, int SoughtIdt ) { LinkedList nearest (nns[nnns-1]); // nn l. list for ifstream DataFile, MCFile; ofstream OutputFile[200]; int cnt1, cnt2, cnt; // counters double prob[200]; // returned probability sosed TempNeighbour; char* OName = new char[100]; if (nnns > 200) panic (" Too much output nn-s" ) ; for (cnt = 0; cnt < nnns ; cnt++ ) { sprintf (OName, "%s.%d", OutName, nns[cnt]) ; OutputFile[cnt].open (OName); } for (cnt1 = ndata1 ; cnt1 < ndata2; cnt1++ ) // loop on all data point { for (cnt2 = 0; cnt2 < nMC; cnt2++) // loop on all MC points { TempNeighbour.distance = EucliDistance (Data[cnt1], MC[cnt2], d); // trying to put TempNbr. on list if ( cnt2 < nns[nnns-1] || TempNeighbour.distance < nearest.MaxKey ) { TempNeighbour.id = INT (MC[cnt2][d]) ; TempNeighbour.ownprob = MC[cnt2][d+1] ; nearest.Insert (TempNeighbour, TempNeighbour.distance); } } CalcProbability (nearest, SoughtIdt, nnns, nns, 0, prob) ; CalcProbability (nearest, SoughtIdt, nnns, nns, 1, prob) ; for (cnt = 0; cnt < nnns ; cnt++ ) OutputFile[cnt] << prob[cnt] << endl; // writing output nearest.Emptify (); } for (cnt = 0; cnt < nnns ; cnt++ ) OutputFile[cnt].close (); } // gets distance between 2 linked lists of doubles; # of coords is chosen to // be the minimal of 2 inline double EucliDistance ( double* vect1, double* vect2, int d ) { double SquareSum = 0 ; for ( int c = 0 ; c < d ; c++ ) SquareSum += (vect1[c]-vect2[c]) * (vect1[c]-vect2[c]) ; return sqrt (SquareSum); } // calculating probability for a data point to belong to chosen group // prob = (number of nn with chosen ID) / (total number of nn) inline void CalcProbability ( LinkedList &list, int SoughtIdt, int nnns, int nns[], int flag, double probability[]) { sosed ANeighb; int cnt1, cnt2 = 0 ; char *com = new char[50]; double ngood = 0; if ( ! list.Start () ) panic ("Empty list in CalcProbability\n"); for (cnt1 = 0; cnt1 < nnns; cnt1++ ) { while (cnt2++ < nns[cnt1]) { ANeighb = list.GetValue() ; if ( ! list.Forward () ) panic ("Bad list in CalcProbability"); if (flag) ngood += ANeighb.ownprob; else if ( abs (ANeighb.id) == SoughtIdt ) ngood += 1; } probability[cnt1] = ngood / (double) nns[cnt1]; } if (!flag) return; double pos_purity, neg_purity; int n_pos_good=0, n_pos_bad=0, n_neg_good=0, n_neg_bad=0; cnt2 = 0 ; list.Start(); for (cnt1 = 0; cnt1 < nnns; cnt1++ ) { while (cnt2++ < nns[cnt1]) { ANeighb = list.GetValue() ; if ( ! list.Forward () ) panic ("Bad list in CalcProbability"); if (ANeighb.ownprob > probability[cnt1]) { if (abs (ANeighb.id) == SoughtIdt) n_pos_good++; else n_pos_bad++; } else { if (abs (ANeighb.id) == SoughtIdt) n_neg_good++; else n_neg_bad++; } } pos_purity = (double) n_pos_good/(n_pos_good + n_pos_bad) ; neg_purity = (double) n_neg_good/(n_neg_good + n_neg_bad) ; probability [cnt1] = pos_purity; } } // panic void panic (char* str) { cerr << str << endl ; exit (0); }