HPCToolkit
MergeDataFiles.cpp
Go to the documentation of this file.
1 // -*-Mode: C++;-*-
2 
3 // * BeginRiceCopyright *****************************************************
4 //
5 // $HeadURL: https://hpctoolkit.googlecode.com/svn/branches/hpctoolkit-hpcserver/src/tool/hpcserver/MergeDataFiles.cpp $
6 // $Id: MergeDataFiles.cpp 4333 2013-07-31 01:00:13Z felipet1326@gmail.com $
7 //
8 // --------------------------------------------------------------------------
9 // Part of HPCToolkit (hpctoolkit.org)
10 //
11 // Information about sources of support for research and development of
12 // HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
13 // --------------------------------------------------------------------------
14 //
15 // Copyright ((c)) 2002-2019, Rice University
16 // All rights reserved.
17 //
18 // Redistribution and use in source and binary forms, with or without
19 // modification, are permitted provided that the following conditions are
20 // met:
21 //
22 // * Redistributions of source code must retain the above copyright
23 // notice, this list of conditions and the following disclaimer.
24 //
25 // * Redistributions in binary form must reproduce the above copyright
26 // notice, this list of conditions and the following disclaimer in the
27 // documentation and/or other materials provided with the distribution.
28 //
29 // * Neither the name of Rice University (RICE) nor the names of its
30 // contributors may be used to endorse or promote products derived from
31 // this software without specific prior written permission.
32 //
33 // This software is provided by RICE and contributors "as is" and any
34 // express or implied warranties, including, but not limited to, the
35 // implied warranties of merchantability and fitness for a particular
36 // purpose are disclaimed. In no event shall RICE or contributors be
37 // liable for any direct, indirect, incidental, special, exemplary, or
38 // consequential damages (including, but not limited to, procurement of
39 // substitute goods or services; loss of use, data, or profits; or
40 // business interruption) however caused and on any theory of liability,
41 // whether in contract, strict liability, or tort (including negligence
42 // or otherwise) arising in any way out of the use of this software, even
43 // if advised of the possibility of such damage.
44 //
45 // ******************************************************* EndRiceCopyright *
46 
47 //***************************************************************************
48 //
49 // File:
50 // $HeadURL: https://hpctoolkit.googlecode.com/svn/branches/hpctoolkit-hpcserver/src/tool/hpcserver/MergeDataFiles.cpp $
51 //
52 // Purpose:
53 // Merges databases from the many files to a megatrace (.mt) file. Rarely used
54 // and will be obsolete soon.
55 //
56 // Description:
57 // Defines MergeDataFiles::merge and its helpers: insertMarker, isMergedFileCorrect, removeFiles, atLeastOneValidFile and splitString.
58 //
59 //***************************************************************************
60 
61 #include "MergeDataFiles.hpp"
62 #include "ByteUtilities.hpp"
63 #include "Constants.hpp"
64 #include "FileUtils.hpp"
65 #include "DebugUtils.hpp"
66 #include "ProgressBar.hpp"
67 
68 #include <string>
69 #include <algorithm>
70 #include <cstdlib>
71 #include <cstdio>
72 #include <sstream>
73 
74 using namespace std;
75 typedef int64_t Long;
76 namespace TraceviewerServer
77 {
78  MergeDataAttribute MergeDataFiles::merge(string directory, string globInputFile,
79  string outputFile)
80  {
81  int lastDot = globInputFile.find_last_of('.');
82  string suffix = globInputFile.substr(lastDot);
83 
84  DEBUGCOUT(2) << "Checking to see if " << outputFile << " exists" << endl;
85 
86 
87  if (FileUtils::exists(outputFile))
88  {
89 
90  DEBUGCOUT(2) << "Exists" << endl;
91 
92  if (isMergedFileCorrect(&outputFile))
94  // the file exists but corrupted.
95  cout << "Database file may be corrupted. Continuing" << endl;
96  return STATUS_UNKNOWN;
97  //remove(OutputFile.string().c_str());
98  }
99 
100  DEBUGCOUT(2) << "Doesn't exist" << endl;
101  // check if the files in glob patterns is correct
102 
103  if (!atLeastOneValidFile(directory))
104  {
105  return FAIL_NO_DATA;
106  }
107 
108  DataOutputFileStream dos(outputFile.c_str());
109 
110  //-----------------------------------------------------
111  // 1. write the header:
112  // int type (0: unknown, 1: mpi, 2: openmp, 3: hybrid, ...
113  // int num_files
114  //-----------------------------------------------------
115 
116  int type = 0;
117  dos.writeInt(type);
118 
119  vector<string> allPaths = FileUtils::getAllFilesInDir(directory);
120  vector<string> filteredFileNames;
121  vector<string>::iterator it;
122  for (it = allPaths.begin(); it != allPaths.end(); it++)
123  {
124  string val = *it;
125  if (val.find(".hpctrace") < string::npos)//This is hardcoded, which isn't great but will have to do because GlobInputFile is regex-style ("*.hpctrace")
126  filteredFileNames.push_back(val);
127  }
128  // on linux, we have to sort the files
129  //To sort them, we need a random access iterator, which means we need to load all of them into a vector
130  sort(filteredFileNames.begin(), filteredFileNames.end());
131 
132  dos.writeInt(filteredFileNames.size());
133  const Long num_metric_header = 2 * SIZEOF_INT; // type of app (4 bytes) + num procs (4 bytes)
134  Long num_metric_index = filteredFileNames.size()
135  * (SIZEOF_LONG + 2 * SIZEOF_INT);
136  FileOffset currentOffset = num_metric_header + num_metric_index;
137 
138  int name_format = 0; // FIXME hack:some hpcprof revisions have different format name !!
139  //-----------------------------------------------------
140  // 2. Record the process ID, thread ID and the currentOffset
141  // It will also detect if the application is mp, mt, or hybrid
142  // no accelator is supported
143  // for all files:
144  // int proc-id, int thread-id, long currentOffset
145  //-----------------------------------------------------
146  vector<string>::iterator it2;
147  for (it2 = filteredFileNames.begin(); it2 < filteredFileNames.end(); it2++)
148  {
149 
150  string Filename = *it2;
151  int last_pos_basic_name = Filename.length() - suffix.length();
152  string Basic_name = Filename.substr(FileUtils::combinePaths(directory, "").length(),//This ensures we count the "/" at the end of the path
153  last_pos_basic_name);
154 
155  vector<string> tokens = splitString(Basic_name, '-');
156 
157 
158  int num_tokens = tokens.size();
159  if (num_tokens < PROC_POS)
160  // if it is wrong file with the right extension, we skip
161  continue;
162  int proc;
163  string Token_To_Parse = tokens[name_format + num_tokens - PROC_POS];
164  proc = atoi(Token_To_Parse.c_str());
165  if ((proc == 0) && (!FileUtils::stringActuallyZero(Token_To_Parse)))
166  {
167  // old version of name format
168  name_format = 1;
169  string Token_To_Parse = tokens[name_format + num_tokens - PROC_POS];
170  proc = atoi(Token_To_Parse.c_str());
171  }
172  dos.writeInt(proc);
173  if (proc != 0)
174  type |= MULTI_PROCESSES;
175  int Thread = atoi(tokens[name_format + num_tokens - THREAD_POS].c_str());
176  dos.writeInt(Thread);
177  if (Thread != 0)
178  type |= MULTI_THREADING;
179  dos.writeLong(currentOffset);
180  currentOffset += FileUtils::getFileSize(Filename);
181  }
182  //-----------------------------------------------------
183  // 3. Copy all data from the multiple files into one file
184  //-----------------------------------------------------
185  ProgressBar prog("Merging database", filteredFileNames.size());
186  for (it2 = filteredFileNames.begin(); it2 < filteredFileNames.end(); it2++)
187  {
188  string i = *it2;
189 
190  ifstream dis(i.c_str(), ios_base::binary | ios_base::in);
191  char data[PAGE_SIZE_GUESS];
192  dis.read(data, PAGE_SIZE_GUESS);
193  int bytesRead = dis.gcount();
194  while (bytesRead > 0)
195  {
196  dos.write(data, bytesRead);
197  dis.read(data, PAGE_SIZE_GUESS);
198  bytesRead = dis.gcount();
199  }
200  dis.close();
201  prog.incrementProgress();
202  }
203  insertMarker(&dos);
204  dos.close();
205  //-----------------------------------------------------
206  // 4. FIXME: write the type of the application
207  // the type of the application is computed in step 2
208  // Ideally, this step has to be in the beginning !
209  //-----------------------------------------------------
210  //While we don't actually want to do any input operations, adding the input flag prevents the file from being truncated to 0 bytes
211  DataOutputFileStream f(outputFile.c_str(), ios_base::in | ios_base::out | ios_base::binary);
212  f.writeInt(type);
213  f.close();
214 
215  //-----------------------------------------------------
216  // 5. remove old files
217  //-----------------------------------------------------
218  removeFiles(filteredFileNames);
219  return SUCCESS_MERGED;
220  }
221 
222 
223 
224  void MergeDataFiles::insertMarker(DataOutputFileStream* dos)
225  {
226  dos->writeLong(MARKER_END_MERGED_FILE);
227  }
228  bool MergeDataFiles::isMergedFileCorrect(string* filename)
229  {
230  ifstream f(filename->c_str(), ios_base::binary | ios_base::in);
231  bool isCorrect = false;
232  Long pos = FileUtils::getFileSize(*filename) - SIZEOF_LONG;
233 
234  if (pos > 0)
235  {
236  f.seekg(pos, ios_base::beg);
237  char buffer[8];
238  f.read(buffer, 8);
239  uint64_t marker = ByteUtilities::readLong(buffer);
240 
241  isCorrect = (marker==MARKER_END_MERGED_FILE);
242  }
243  f.close();
244  return isCorrect;
245  }
246  bool MergeDataFiles::removeFiles(vector<string> vect)
247  {
248  bool success = true;
249  vector<string>::iterator it;
250  for (it = vect.begin(); it != vect.end(); ++it)
251  {
252  bool thisSuccess = (remove(it->c_str()) == 0);
253  success &= thisSuccess;
254  }
255  return success;
256  }
257  bool MergeDataFiles::atLeastOneValidFile(string dir)
258  {
259  vector<string> FileList = FileUtils::getAllFilesInDir(dir);
260  vector<string>::iterator it;
261  for (it = FileList.begin(); it != FileList.end(); ++it)
262  {
263  string filename = *it;
264 
265  unsigned int l = filename.length();
266  //if it ends with ".hpctrace", we are good.
267  string ending = ".hpctrace";
268  if (l < ending.length())
269  continue;
270  string supposedext = filename.substr(l - ending.length());
271 
272  if (ending == supposedext)
273  {
274  return true;
275  }
276  }
277  return false;
278  }
279  //From http://stackoverflow.com/questions/236129/splitting-a-string-in-c
280  vector<string> MergeDataFiles::splitString(string toSplit, char delimiter)
281  {
282  vector<string> toReturn;
283  stringstream ss(toSplit);
284  string item;
285  while (getline(ss, item, delimiter)) {
286  toReturn.push_back(item);
287  }
288  return toReturn;
289  }
290 
291 } /* namespace TraceviewerServer */
#define SIZEOF_LONG
Definition: Constants.hpp:66
#define SIZEOF_INT
Definition: Constants.hpp:67
int64_t Long
uint64_t FileOffset
Definition: FileUtils.hpp:78
void incrementProgress(ulong tasks)
Definition: ProgressBar.cpp:86
#define DEBUGCOUT(a)
Definition: DebugUtils.hpp:72