Apache SINGA
A distributed deep learning platform.
 All Classes Namespaces Files Functions Variables Typedefs Enumerator Macros
data_shard.h
1 #ifndef INCLUDE_UTILS_SHARD_H_
2 #define INCLUDE_UTILS_SHARD_H_
3 
4 #include <google/protobuf/message.h>
5 #include <fstream>
6 #include <string>
7 #include <unordered_set>
8 
9 
10 using google::protobuf::Message;
11 
12 namespace singa {
13 
    // DataShard stores training/validation/test tuples.
    // Each tuple is inserted and read back as a (key, value) pair, where the
    // value is either a protobuf Message or a raw std::string.
    // NOTE(review): the on-disk layout is not visible in this header; see the
    // implementation file before relying on any format details.
33 class DataShard {
34  public:
    // Access modes.
    // NOTE(review): the value 1 is not listed here; a further mode may exist
    // in the full header — confirm against the original source.
35  enum {
    // read only mode used in training
37  kRead=0,
    // append mode, e.g. used when a previous creation run crashed
    // NOTE(review): the generated docs attach this description to header
    // line 39, not 41 — confirm it really belongs to kAppend.
41  kAppend=2
42  };
43 
44  public:
    // Initializes the shard object.
    //   folder   - presumably the directory holding the shard file; TODO confirm
    //   mode     - presumably one of the enum values above (kRead, kAppend)
    //   capacity - presumably the size in bytes of the internal buffer
    //              (see capacity_); default 104857600 = 100 * 1024 * 1024
52  DataShard(std::string folder, char mode, int capacity=104857600);
53  ~DataShard();
54 
    // Reads the next tuple from the shard into *key and *val (protobuf value).
    // NOTE(review): the bool presumably signals whether a tuple was read —
    // confirm in the implementation.
62  bool Next(std::string *key, Message* val);
    // Overload of Next() that takes the value as a std::string.
    // NOTE(review): presumably the same contract as the Message overload.
70  bool Next(std::string *key, std::string* val);
71 
    // Appends one tuple (key, tuple) to the shard; the value is a protobuf
    // Message. NOTE(review): the meaning of the bool return is not visible
    // here; keys_ below suggests duplicate keys are rejected — confirm.
78  bool Insert(const std::string& key, const Message& tuple);
    // Overload of Insert() that takes the value as a std::string.
85  bool Insert(const std::string& key, const std::string& tuple);
    // Moves the read pointer to the head of the shard file.
90  void SeekToFirst();
    // Flushes buffered data to disk.
95  void Flush() ;
    // Iterates through all tuples to get the number of tuples.
    // NOTE(review): top-level const on a by-value return type has no effect.
100  const int Count();
    // Returns a copy of path_.
    // NOTE(review): returned by value and not const-qualified.
104  const std::string path(){
105  return path_;
106  }
107 
108  protected:
    // Key-only variant of Next().
    // NOTE(review): the meaning of the int return is not visible in this
    // header — see the implementation.
114  int Next(std::string *key);
    // Sets up the disk pointer at the right position for appending, in case
    // the previous write crashed.
    // NOTE(review): the meaning of the int return is not visible here.
121  int PrepareForAppend(std::string path);
    // Reads data from disk if the current data in the buffer is not a full
    // field; size is presumably the byte length of the field — TODO confirm.
126  bool PrepareNextField(int size);
127 
128  private:
    // mode passed at construction (presumably kRead or kAppend)
129  char mode_;
    // value returned by path()
130  std::string path_;
131  // std::fstream used as either an input stream or an output stream
132  std::fstream fdat_;
133  // set of keys, used to avoid duplicate records
134  std::unordered_set<std::string> keys_;
135  // internal buffer
136  char* buf_;
137  // current offset inside buf_
138  int offset_;
139  // number of bytes allocated for buf_
140  int capacity_;
141  // number of bytes currently held in buf_; used when reading
142  int bufsize_;
143 };
144 } /* singa */
145 #endif // INCLUDE_UTILS_SHARD_H_
bool Insert(const std::string &key, const Message &tuple)
Append one tuple to the shard.
read only mode used in training
Definition: data_shard.h:37
Data shard stores training/validation/test tuples.
Definition: data_shard.h:33
int PrepareForAppend(std::string path)
Set up the disk pointer at the right position for appending, in case the previous write crashed...
const std::string path()
Definition: data_shard.h:104
bool Next(std::string *key, Message *val)
Read the next tuple from the shard.
bool PrepareNextField(int size)
Read data from disk if the current data in the buffer is not a full field.
const int Count()
Iterate through all tuples to count the total number of tuples.
void Flush()
Flush buffered data to disk.
append mode, e.g. used when a previous creation run crashed
Definition: data_shard.h:39
void SeekToFirst()
Move the read pointer to the head of the shard file.
DataShard(std::string folder, char mode, int capacity=104857600)
Initialize the shard object.