commit 60a06cd190b8e18b61fc37982afdba6eeca07ef3 from: vincent.delft date: Sun Feb 17 17:43:10 2019 UTC for version 0.4: add some more comments inside the core correct some logging behaviour add man page add parameters verifications commit - e2af75a1cee6c654ec4c43d95353a31676cdd2a1 commit + 60a06cd190b8e18b61fc37982afdba6eeca07ef3 blob - 89c830eb82347f3bfdb2750bedca4d5f8ccc7d24 blob + f96b33c96ac6045c2d96b4b4eba6ace39dc25fdf --- CHANGELOG +++ CHANGELOG @@ -1,3 +1,8 @@ +Feb 17 2019 - 0.4 + add some more comments inside the core + correct some logging behaviour + add man page + add parameters verifications Oct 21 2018 - 0.3 Add a params table where global parameters are stored Oct 20 2018 - 0.3 blob - 4fdb44f9c4e1511914672377ffdc26c13efab707 blob + 0b50e9f42fd60116f3e6a48748115be5bd67659f --- TODO +++ TODO @@ -1 +1,3 @@ tests, tests and tests + + blob - e04ecedf7600e59bdba1bbaff667e3de831c2676 blob + 45a6222f6b752b22c69e8dfc40ac643c95025c23 --- yabitrot +++ yabitrot @@ -4,21 +4,27 @@ """ Author : Vincent -Version : 0.3 +Version : 0.4 Licence : BSD Require : python >= 3.6 use sqlite3 DB embedded with python package Developed on: OpenBSD -Tested on : OpenBSD, Windows +Tested on : OpenBSD 6.4, Windows 10, osx 10.14 Description : This tool allow you to calculate a checksum for each files in the target folder Those values are stored in an sqlite DB at the root of your targetted folder This program use INODE as key instead of filename, so it can manage hardlinks Since that, the scrip does never go outside the targetted filesystem it works on openBSD, but should work on any systems (OSX, Windows and Linux) + Typically, you must perform a first scan of the folder you want: + yabitrot -p + Then, you can re-scan your folder and yabitrot will compare check sums + with what we can find in te DB + yabitrot -p + /* - * Copyright (c) 2018 Vincent Delft + * Copyright (c) 2018 Vincent Delft * * Permission to use, copy, modify, and distribute this software for any * purpose with or 
without fee is hereby granted, provided that the above @@ -51,9 +57,12 @@ BATCHID = time.time() VERBOSE = 0 LOGFILE = "" DRY_RUN = False +DB_FILE_NAME = ".cksum.db" +BIGFILE_TTS = 10 # number of seconds after which the chksum informs (if verbose) that he is working on a big file def log(text): + """put text in logfile (if provided), or on std output.""" if LOGFILE: tts = time.strftime("%c", time.localtime()) fid = open(LOGFILE, 'a') @@ -65,6 +74,7 @@ def log(text): def print_err(text): + """put text in logfile (if provided) or to std error.""" if LOGFILE: tts = time.strftime("%c", time.localtime()) fid = open(LOGFILE, 'a') @@ -74,7 +84,8 @@ def print_err(text): sys.stderr.flush() -def get_stats(path, osstats, status, chunk_size=DEFAULT_CHUNK_SIZE): +def get_cksum(path, osstats, status, chunk_size=DEFAULT_CHUNK_SIZE): + """return the value that the DB is expecting: cksum, mtime.""" localtts = time.time() crc = 0 stats = {'crc': None, 'mtime': None} @@ -85,7 +96,8 @@ def get_stats(path, osstats, status, chunk_size=DEFAUL while d: crc += zlib.adler32(d) d = f.read(chunk_size) - if VERBOSE > 0 and time.time() - localtts > 10: + if VERBOSE > 0 and time.time() - localtts > BIGFILE_TTS: + # for pur information, we log the big consumers log("big file: %s, inode: %s, size: %.2f MB, %s" % (status, osstats.st_ino, osstats.st_size / 1024 / 1024, path)) localtts = time.time() except OSError as ex: @@ -97,7 +109,38 @@ def get_stats(path, osstats, status, chunk_size=DEFAUL return stats +def get_osstats(fpath, excludes): + """return the os.stat of the selected file.""" + to_skip = False + for excl_patt in excludes: + if fnmatch.fnmatch(fpath, excl_patt): + to_skip = True + if to_skip: + if VERBOSE > 0: + log("Based on exclude rules, we skip: %s" % fpath) + return None + try: + osstats = os.stat(fpath) + except OSError as ex: + if ex.errno in [errno.EACCES, errno.EOPNOTSUPP, errno.ENOENT]: + log("os.stat fails for: %s" % fpath) + return None + else: + raise + if not 
stat.S_ISREG(osstats.st_mode): + if VERBOSE > 0: + log("Not a regular file: %s" % fpath) + return None + return osstats + + class CRCDB: + """The DB class with 2 tables: cksum and params. + cksum is the main DB where we store indes and associated chckesums + params is a key-pair values table. Currently we store: + rootpath: the path from where we perform an anlysis + filesystem id: the id of the targeted filesystem + """ def __init__(self, fpathname, commitlimit=30): self.counter = 0 self.tts = time.time() @@ -176,12 +219,14 @@ class CRCDB: def set_param(self, param, value): if not DRY_RUN: - print("para update") - self.cur.execute("""INSERT INTO params VALUES (?, ?) - ON CONFLICT(param) DO UPDATE SET value=? WHERE param=? - """, (param, value, value, param)) + # print("param update", param, value) + self.cur.execute('INSERT OR REPLACE into params VALUES (?, ?)', (param, value)) self.commit() + def get_param(self): + self.cur.execute('SELECT * from params') + return self.cur.fetchall() + def close(self): self.conn.commit() self.conn.close() @@ -204,95 +249,100 @@ class CRCDB: def analyze(rootpath, excludes=[]): - dbpath = os.path.join(rootpath, ".cksum.db") + """ananlyze rootath and all sub-folders. 
+ if a DB exists, it compare the checksum of the associated inode with what we have in the DB for this inode + if a DB does not exists it store the checksum associated to the inode + """ + dbpath = os.path.join(rootpath, DB_FILE_NAME) DB = CRCDB(dbpath, COMMIT_LIMIT) log("DB stored on: %s" % (dbpath)) - excludes.append('.cksum.db') + excludes.append('*/%s' % DB_FILE_NAME) + excludes.append('*/%s-journal' % DB_FILE_NAME) counter = 0 counter_added = 0 counter_update = 0 counter_biterror = 0 total_size = 0 + parameters = DB.get_param() filesystemid = os.stat(rootpath).st_dev log("Device ID:%s" % filesystemid) - DB.set_param("rootpath",os.path.join(os.getcwd(), rootpath)) - DB.set_param("filesystemid", filesystemid) + if parameters: + for param_name, value in parameters: + if param_name == 'rootpath' and value != rootpath: + print_err("We have detected a DB at %s" % dbpath) + print_err("This DB has been created with the path: %s" %value) + print_err("But, you have entered the following path: %s" % rootpath) + return -1 + if param_name == "filesystemid" and value != str(filesystemid): + print_err("We have detected a DB at %s" % dbpath) + print_err("This DB has been created with the filesystemID:%s" % filesystemid) + print_err("But, currently the filesystem ID is: %s" % filesystemid) + return -1 + else: + DB.set_param("rootpath", rootpath) + DB.set_param("filesystemid", filesystemid) analyze_tts = time.time() for path, dummy, files in os.walk(rootpath): for elem in files: - to_skip = False - for excl_patt in excludes: - if fnmatch.fnmatch(elem, excl_patt): - to_skip = True fpath = os.path.join(path, elem) - if to_skip: - if VERBOSE > 0: - log("Based on exclude rules, we skip: %s" % fpath) + osstats = get_osstats(fpath, excludes) + if not osstats or osstats.st_dev != filesystemid: + # print("skip:%s" % fpath) continue if VERBOSE > 1 and time.time() - analyze_tts > COMMIT_LIMIT: log("working with:", fpath) analyze_tts = time.time() - try: - osstats = os.stat(fpath) - 
except OSError as ex: - if ex.errno in [errno.EACCES, errno.EOPNOTSUPP, errno.ENOENT]: - osstats = None - else: - raise - if osstats is None: - log("os.stat fails for: %s" % fpath) - continue - if osstats.st_dev != filesystemid: - continue - if not stat.S_ISREG(osstats.st_mode): - if VERBOSE > 0: - log("Not a regular file: %s" % fpath) - continue + # print("process:%s" % fpath) counter += 1 db_rec = DB.get_rec(osstats.st_ino) - stats = None + cksum = None if db_rec is None: - stats = get_stats(fpath, osstats, "new") - DB.add_rec(osstats.st_ino, stats) + cksum = get_cksum(fpath, osstats, "new") + DB.add_rec(osstats.st_ino, cksum) counter_added += 1 else: if db_rec[2] != BATCHID: - stats = get_stats(fpath, osstats, "update") + cksum = get_cksum(fpath, osstats, "update") if db_rec[0] != osstats.st_mtime: - DB.update_rec(osstats.st_ino, stats) + DB.update_rec(osstats.st_ino, cksum) counter_update += 1 - elif db_rec[1] != stats['crc']: + elif db_rec[1] != cksum['crc']: log("bit ERROR for file %s" % (fpath)) log(" Previous:") - log(" scan was on %s" % time.strftime("%c", time.localtime(db_rec[0]))) + log(" scan was on %s" % time.strftime("%c", time.localtime(db_rec[2]))) log(" checksum was: %s" % db_rec[1]) + log(" mtime was: %s" % time.strftime("%c", time.localtime(db_rec[0]))) log(" Current:") log(" scan on %s" % time.strftime("%c", time.localtime(BATCHID))) - log(" checksum is: %s" % stats['crc']) + log(" checksum is: %s" % cksum['crc']) + log(" mtime is: %s" % time.strftime("%c", time.localtime(osstats.st_mtime))) counter_biterror += 1 - if stats and stats['crc']: + if cksum and cksum['crc']: total_size += osstats.st_size log("\n") DB.cleanup() records = DB.count() DB.close() - print_err("%s files added" % counter_added) - print_err("%s files updates" % counter_update) - print_err("%s files error" % counter_biterror) - print_err("%s files analysed in %.2f sec, %.3f GB" % (counter, time.time() - BATCHID, total_size / 1024 / 1024 / 1024)) - print_err("%s entries in the 
DB" % records) + log("%s files added" % counter_added) + log("%s files updates" % counter_update) + log("%s files error" % counter_biterror) + log("%s files analysed in %.2f sec, %.3f GB" % (counter, time.time() - BATCHID, total_size / 1024 / 1024 / 1024)) + log("%s entries in the DB" % records) if os.name == 'posix' and not DRY_RUN: os.chmod(dbpath, stat.S_IRUSR | stat.S_IWUSR) os.chown(dbpath, os.getuid(), os.getgid()) if counter_biterror > 0: - print_err("Several bit error, plese check the log file") + print_err("Several bit error, please check the log file") sys.exit(counter_biterror) -def force_db(fpath, rootpath, excludes = []): - dbpath = os.path.join(rootpath, ".cksum.db") + +def force_db(fpath, rootpath, excludes=[]): + """This udate the DB record for this inode""" + dbpath = os.path.join(rootpath, DB_FILE_NAME) DB = CRCDB(dbpath, COMMIT_LIMIT) log("DB stored on: %s" % (dbpath)) - excludes.append('.cksum.db') + excludes.append('*/%s' % DB_FILE_NAME) + excludes.append('*/%s-journal' % DB_FILE_NAME) filesystemid = os.stat(rootpath).st_dev log("Device ID:%s" % filesystemid) to_skip = False @@ -314,12 +364,13 @@ def force_db(fpath, rootpath, excludes = []): if osstats is None: log("os.stat fails for: %s" % fpath) return 2 - stats = get_stats(fpath, osstats, "update") + stats = get_cksum(fpath, osstats, "update") DB.update_rec(osstats.st_ino, stats) log("checkcum calculated and stored in the DB") DB.close() return 0 + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( blob - /dev/null blob + 0439c4d04c561568baa1ad1b5e2c20185259fd48 (mode 644) --- /dev/null +++ yabitrot.1 @@ -0,0 +1,114 @@ +.\" +.\" Copyright (c) 2018 Vincent Delft +.\" +.\" Permission to use, copy, modify, and distribute this software for any +.\" purpose with or without fee is hereby granted, provided that the above +.\" copyright notice and this permission notice appear in all copies. 
+.\" +.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +.\" +.Dd February 17 2019 +.Dt YABITROT 8 +.Os +.Sh NAME +.Nm Yabitrot +.Nd A tool that checks if your files are not impacted by a bit rotation issue +.Sh SYNOPSIS +.Nm yabitrot +.Op Fl nh +.Op Fl s Ar integer +.Op Fl c Ar integer +.Op Fl p Ar path +.Op Fl e Ar string +.Op Fl v Ar integer +.Op Fl L Ar path +.Op Fl f Ar path +.Sh DESCRIPTION +.Nm +will check if your files do not have a bit rotation issue. +.Pp +.Nm +is yet another bit rotation engine. Unlike several others, it works based on the inode and +not based on the file's name. As a consequence, yabitrot will never scan files outside the current file system. +But on the other side, if you are using hard-links, you will scan fewer files. +.Pp +.Nm +is using an sqlite DB to store the check sums and the associated inodes. This sqlite DB will be stored at the +root of the directory you want to analyze. If this DB is not present, yabitrot will create it. If it's present +it will compare the checksum of the files (inodes) you have on disk with the check sum stored in the DB. +If the values are not the same, you will be warned that you have some issues with one file. +.Pp +.Sh PARAMETERS +.Bl -tag -width "-s integer" +.It Fl h +Help. Display the possible parameters +.It Fl n +Dry-run. Do not modify the DB. +.It Fl s Ar integer +Size of the chunks in bytes to read files. Default is 16384 bytes +.It Fl c Ar integer +Define the frequency at which yabitrot will perform DB commits. 
+This value is expressed in seconds. By default it's 10 seconds. +.It Fl p Ar path +Path of the folder you want to analyze. This is also where the +Sqlite DB will be located. +.It Fl e Ar string +List of exceptions in a glob format. During the analysis, if a file +matches one of these patterns, we skip the checksum of that file. +Example: *.core,*.pyc +.It Fl v Ar integer +verbose. Values can be 0, 1 or 2. Default is 0. +.It Fl L Ar path +Path name of your log file. If not defined, the log data will be +printed on the standard output. +.It Fl f Ar path +path name of a file you want to force a recalculation of the checksum. +.El +.Sh EXIT STATUS +The exit status will have the number of bit rotation issues. So, +if no issue, the exit status will be 0. +.Sh EXAMPLES +The following triggers an analysis starting on the current folder. The logging info will be +displayed on the standard output +.Bd + $ yabitrot +.Ed +.Pp +The following triggers an analysis on the folder /mnt/sd1 where *.core and *.pyc files +are skipped. The log info will be written on /var/log/yabitrot.log. +If no DB is found in /mnt/sd1, yabitrot will create one. Otherwise, +a comparison between checksums will be performed. +.Bd + $ yabitrot -p /mnt/sd1 -e "*.core,*.pyc" -L /var/log/yabitrot.log +.Ed +.Pp +The following will force a checksum recalculation and store it on the DB. The logging +information will be presented on the standard output +.Bd + $ yabitrot -p /mnt/sd1 -f /mnt/sd1/subfolder/file1 +.Ed +.Sh ERRORS +If you scan a folder from a different path name, you will receive an error. Please +be consistent between each execution. +If the filesystem ID differs from the one stored in the DB, you will also receive an error. +.Sh HISTORY +.Nm +has been created in October 2018 on +.Ox 6.4 . +.Sh AUTHOR +.Nm +was written by +.An Vincent Delft Aq Mt vincent.delft@gmail.com . +.Sh CAVEATS +.Nm +has been developed on +.Ox . +It has also been tested on Windows 10 and on Mac OSX 10.14. + +