// piros / XPIR
							/* Copyright (C) 2014 Carlos Aguilar Melchor, Joris Barrier, Marc-Olivier Killijian
 * This file is part of XPIR.
 *
 *  XPIR is free software: you can redistribute it and/or modify
 *	it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  XPIR is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with XPIR.  If not, see <http://www.gnu.org/licenses/>.
 */

#include "DBDirectoryProcessor.hpp"

/************************************************/
/* Default constructor : no splitting           */
/*  -> 1 input file -> 1 output stream          */
/*                                              */
/* Scans DEFAULT_DIR_NAME, records every entry  */
/* except "." and ".." in file_list and keeps   */
/* the largest file size in maxFileBytesize.    */
/************************************************/
DBDirectoryProcessor::DBDirectoryProcessor() : filesSplitting(false) {
	// TODO(feature) deal with sub-directories in the database !

	directory=std::string(DEFAULT_DIR_NAME);
	maxFileBytesize=0;

	// Create the pool of ifstream (released in the destructor)
	for(int i=0;i<NB_FILE_DESCRIPTORS;i++)
		fdPool.push_back(new std::ifstream());

	// Then create the catalog and get the filenumbers and size
	DIR *dir = opendir (directory.c_str());

	// If there is no error when opening the directory
	if (dir != nullptr)
	{
		uint32_t processed = 0;
		struct dirent *ent = nullptr;

		// For each entry
		while ( (ent = readdir (dir)) != nullptr)
		{
			// Ignore only the special entries "." and "..".
			// (Comparing with "> 0" would also drop every name that sorts
			// before ".", e.g. names starting with '-', '+' or '#'.)
			if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0)
				continue;

			// Progress report once every 128 entries
			if ((processed % 128) == 0)
				std::cout << "DBDirectoryProcessor: " << processed+1 << " entries processed\r" << std::flush;
			processed++;

			// Add the file name to the catalog
			std::string fileName = std::string( ent->d_name );
			file_list.push_back( fileName );

			// Track the biggest file: every element is treated as having that size
			// NOTE(review): the path is built as directory + fileName, so
			// DEFAULT_DIR_NAME presumably ends with a path separator - confirm
			uint64_t fileSize = getFileSize(directory + fileName);
			if (fileSize > maxFileBytesize)
				maxFileBytesize = fileSize;
		}
		std::cout << "DBDirectoryProcessor: " << processed << " entries processed" << std::endl;
		closedir (dir);
	}
	else // If there was a problem opening the directory
	{
		std::cout << "DBDirectoryProcessor: Error opening database directory" << std::endl;
	}

	std::cout << "DBDirectoryProcessor: The size of the database is " << maxFileBytesize*file_list.size() << " bytes" << std::endl;
	std::cout << "DBDirectoryProcessor: The number of elements in the catalog is " << file_list.size() << std::endl;
}

// This constructor is called when we need File-splitting.
// The first regular file found in DEFAULT_DIR_NAME is exposed as nbStreams
// virtual elements named "0", "1", ..., each of size fileSize/nbStreams bytes.
// The process exits with status 1 when nbStreams is zero, the directory cannot
// be opened, it holds no regular file, or the file is smaller than nbStreams bytes.
DBDirectoryProcessor::DBDirectoryProcessor(uint64_t nbStreams) : filesSplitting(true) {

	directory=std::string(DEFAULT_DIR_NAME);
	maxFileBytesize=0;

	// Splitting in zero streams is meaningless and would divide by zero below
	if (nbStreams == 0) {
		std::cout << "DBDirectoryProcessor: ERROR cannot split a file in zero streams" << std::endl;
		exit(1);
	}

	// Create the pool of ifstream (released in the destructor)
	for(int i=0;i<NB_FILE_DESCRIPTORS;i++)
		fdPool.push_back(new std::ifstream());

	// Then create the catalog and get the filenumbers and size
	DIR *dir = opendir (directory.c_str());

	// If there was a problem opening the directory
	if (dir == nullptr)
	{
		std::cout << "DBDirectoryProcessor: Error when opening directory " <<directory<< std::endl;
		exit(1);
	}

	// WARNING: In case of file-splitting, we deal only with the first file
	// On some filesystems, the dir contains also special files such as "." and "..", skip them.
	// readdir() returns NULL once the directory is exhausted, so stop there
	// instead of dereferencing a null entry.
	// NOTE(review): entries reported as DT_UNKNOWN are skipped too - confirm
	// whether a stat() fallback is needed for the target filesystems.
	struct dirent *ent = readdir (dir);
	while (ent != nullptr && ent->d_type != DT_REG) {
		ent = readdir (dir);
	}

	if (ent == nullptr) {
		std::cout << "DBDirectoryProcessor: ERROR no regular file found in directory " << directory << std::endl;
		closedir (dir);
		exit(1);
	}

	// Remember the real file behind all the virtual streams
	// (the name is copied before closedir() invalidates ent)
	std::string fileName=directory + std::string( ent->d_name );
	realFileName=fileName;
	closedir (dir);

	uint64_t realFileSize = getFileSize(realFileName);
	maxFileBytesize = realFileSize/nbStreams;

	if(maxFileBytesize==0) {
		std::cout << "DBDirectoryProcessor: ERROR cannot split a file en less than one byte elements!" << std::endl;
		std::cout << "DBDirectoryProcessor: file " << realFileName << " is only "<< realFileSize << " long" << std::endl;
		exit(1);
	}

	// One catalog entry per virtual stream, named after its index
	for(uint64_t i=0;i<nbStreams;i++) {
		file_list.push_back( std::to_string(i) );
	}

#ifdef DEBUG
	std::cout << "maxFileBytesize." <<maxFileBytesize<< std::endl;
	std::cout << "file_list.size()." <<file_list.size()<< std::endl;

#endif
	std::cout << "DBDirectoryProcessor: The size of the database is " << maxFileBytesize*file_list.size() << " bytes" << std::endl;
	std::cout << "DBDirectoryProcessor: The number of elements in the catalog is " << file_list.size() << std::endl;
}

// Releases every ifstream currently held in the pool.
DBDirectoryProcessor::~DBDirectoryProcessor() {
	for (auto it = fdPool.begin(); it != fdPool.end(); ++it) {
		delete *it;
	}
}

// Builds the textual catalog, one field per line.
//  - typeOfCatalog == true : "0", the number of elements, then for each
//    element its name followed by its size (the real file size without
//    splitting, maxFileBytesize with splitting).
//  - typeOfCatalog == false: compact form "1", the number of elements and
//    maxFileBytesize.
// Also resets the directory member to DEFAULT_DIR_NAME.
std::string DBDirectoryProcessor::getCatalog(const bool typeOfCatalog) {
	directory=std::string(DEFAULT_DIR_NAME);

	std::string catalog;

	if(!typeOfCatalog) {
		// Compact representation, i.e. nbFiles / fileSize
		catalog = std::to_string((unsigned int)1);
		catalog += "\n";
		catalog += std::to_string(getNbStream());
		catalog += "\n";
		catalog += std::to_string(maxFileBytesize);
		catalog += "\n";
		return catalog;
	}

	// Full representation: header first ...
	catalog = std::to_string((unsigned int)0);
	catalog += "\n";
	catalog += std::to_string(getNbStream());
	catalog += "\n";

	// ... then name and size of each element, newline separated
	for (const auto& entryName : file_list)
	{
		catalog += entryName;
		catalog += "\n";
		if(filesSplitting) {
			catalog += std::to_string(getmaxFileBytesize());
		} else {
			catalog += std::to_string(getFileSize(directory+entryName));
		}
		catalog += "\n";
	}
	return catalog;
}

// Database size in bits: every catalog element counts as maxFileBytesize bytes.
uint64_t DBDirectoryProcessor::getDBSizeBits() {
	const auto totalBytes = maxFileBytesize*file_list.size();
	return totalBytes*8;
}
// Number of elements in the catalog (one stream per element).
uint64_t DBDirectoryProcessor::getNbStream() {
	const uint64_t streamCount = file_list.size();
	return streamCount;
}
uint64_t DBDirectoryProcessor::getmaxFileBytesize() {
	return maxFileBytesize;
}

// Takes an ifstream from the pool, opens it on element streamNb and positions
// it requested_offset bytes after the start of that element.
//  - streamNb: index of the element in file_list
//  - requested_offset: byte offset inside the element
// Returns the stream; the caller hands it back with closeStream().
// Open/seek failures are not reported here: they are left in the stream state.
std::ifstream* DBDirectoryProcessor::openStream(uint64_t streamNb, uint64_t requested_offset) {
	// Calling back()/pop_back() on an empty pool is undefined behaviour, so
	// grow the pool on demand when more streams are open than it holds.
	// closeStream() puts the extra stream in the pool and the destructor frees it.
	if (fdPool.empty()) {
		fdPool.push_back(new std::ifstream());
	}

	std::ifstream* is = fdPool.back();
	fdPool.pop_back();

	if(!filesSplitting) {
		// When there is no splitting, each ifstream is associated with a real file
		// (at least when no aggregation is done which is the case for now)
		std::string local_directory(DEFAULT_DIR_NAME);
		is->open( local_directory + file_list[streamNb], std::ios::binary );
		is->seekg(requested_offset);
	} else {
		// But when we are doing file splitting, every stream reads the same real
		// file: we just need to position the ifstream at the start of the slice
		uint64_t splitting_offset=streamNb*getmaxFileBytesize();
		is->open( realFileName, std::ios::binary );
		is->seekg(splitting_offset + requested_offset);
	}
	return is;
}

// Fills buf with size bytes taken from stream s, zero-padding whatever the
// stream cannot provide (end of file reached, or stream in a failed state).
//  - s: stream to read from
//  - buf: destination buffer, at least size bytes long
//  - size: number of bytes to produce
// Always returns size.
uint64_t DBDirectoryProcessor::readStream(std::ifstream* s, char * buf, uint64_t size) {
	// read() is used instead of readsome(): readsome() only reports what the
	// library considers immediately available, which is implementation
	// specific for file streams and may be 0 even though data remains; that
	// would turn the whole buffer into padding.
	s->read(buf, static_cast<std::streamsize>(size));

	// gcount() is the number of bytes really extracted, even when read() hit
	// the end of the file or the stream was already in error
	const uint64_t sizeRead = static_cast<uint64_t>(s->gcount());

	// Check if we need to pad
	if(sizeRead<size) {
		bzero(buf+sizeRead,size-sizeRead);
	}
	return size;
}

// Closes the file behind s and puts the ifstream back in the pool for reuse.
void DBDirectoryProcessor::closeStream(std::ifstream* s) {
	std::ifstream& stream = *s;
	stream.close();

	fdPool.push_back(s);
}

// Size of the file at filePath, computed as the distance between the stream
// position right after opening and the position after seeking to the end.
std::streampos DBDirectoryProcessor::getFileSize( std::string filePath ){
	std::ifstream input( filePath.c_str(), std::ios::binary );

	// Position just after opening
	const std::streampos first = input.tellg();

	// Position at the end of the file
	input.seekg( 0, std::ios::end );
	const std::streampos last = input.tellg();

	input.close();

	std::streampos length = 0;
	length = last - first;
	return length;
}

void DBDirectoryProcessor::readAggregatedStream(uint64_t streamNb, uint64_t alpha, uint64_t offset, uint64_t bytes_per_file, char* rawBits){
	uint64_t fileByteSize = std::min(bytes_per_file, getmaxFileBytesize()-offset);
	uint64_t startStream = streamNb*alpha;
	uint64_t endStream = std::min(streamNb*alpha + alpha - 1, getNbStream() - 1);
	uint64_t paddingStreams = (streamNb*alpha+alpha) >= getNbStream() ? (streamNb*alpha+alpha) - getNbStream() : 0;

  #pragma omp critical
	{	
		for (int i=startStream; i <= endStream; i++)
		{
			std::ifstream *stream = openStream(i, offset);

			// Just read the file (plus padding for that file)
			readStream(stream, rawBits + (i % alpha) * fileByteSize, fileByteSize);

			closeStream(stream);
		} 

		if(paddingStreams !=0)
		{
			bzero(rawBits + (endStream % alpha) * fileByteSize, fileByteSize*paddingStreams);
		}
	}
}