'''
Search for a pattern in a binary file.  If located, the offset(s) will
be printed out.

Examples (assumes bash-like command line and program aliased to bgrep):

    bgrep -s "\\d+" file
        Show all numbers and their offsets in the file.

    bgrep -s "0x[\\dA-Fa-f]+" file
        Show all hexadecimal numbers and their offsets in the file.

    bgrep "\\r\\n" file
        Show the offsets of the carriage return/linefeed pairs.

    bgrep -s "\\d{1,2}/\\d{1,2}/\\d\\d" file
        Show the offsets of all dates in the file of the form n1/n1/n2
        where n1's are one or two digits numbers and n2 is a two digit
        number.

---------------------------------------------------------------------------
Copyright (C) 2009 Don Peterson

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this program; if not, write to the Free
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
MA 02111-1307  USA
'''

import getopt
import os
import re
import sys

out = sys.stdout.write
nl = "\n"

# Command line option settings
plain_text = False              # -f
ignore_case = False             # -i
just_show_file_matches = False  # -l
hex_to_binary = False           # -o
show_match = False              # -s

# Every backslash is doubled so the text shown to the user contains a single
# backslash without relying on invalid escape sequences such as "\d".
manual = '''%s [options] pattern [file1...]
  Do a pattern search in binary files and print the zero-based offset of
  the located pattern.  Offsets will be printed in decimal, hex, and
  percentage.  If no files are given, stdin is searched.  Python regular
  expressions are used in pattern, not grep-style patterns.

  If you use the -o option, the pattern must just consist of pairs of hex
  digits; for example:  "6865 6c 6c 6f".  Any space characters will be
  removed, then each pair of hex characters will be converted to a byte.

Options:
    -f  Plain text match (i.e., don't use regular expressions)
    -i  Ignore case
    -l  Just print files that have one or more matches
    -o  Convert the hex digits in the pattern to a binary string
    -s  Include the matched binary string in the printout (not with -f)

Some python regular expression special characters:
    \\b  Empty string at beginning or end of word
    \\B  Empty string not at beginning or end of word.
    \\d  Any decimal digit == [0-9]
    \\D  Any non decimal digit == [^0-9]
    \\s  Any whitespace character == [ \\t\\n\\r\\f\\v]
    \\S  Any nonwhitespace character == [^ \\t\\n\\r\\f\\v]
    \\w  Any alphanumeric character
    \\W  Any non-alphanumeric character
    \\Z  Matches only at the end of the string
''' % sys.argv[0]

_HEX_DIGITS = "0123456789abcdef"


def Usage():
    '''Print the manual to stdout and exit with status 0.'''
    out(manual)
    sys.exit(0)


def Error(msg):
    '''Write msg (plus a newline) to stderr and exit with status 1.'''
    sys.stderr.write(msg + nl)
    sys.exit(1)


def _as_bytes(text):
    '''Return a command line argument as bytes.

    Values that are already bytes (always the case on Python 2) are
    returned unchanged; text is converted with os.fsencode, which undoes
    the decoding Python 3 applied to the command line.
    '''
    if isinstance(text, bytes):
        return text
    return os.fsencode(text)


def _write_bytes(data):
    '''Write the matched data to stdout without re-encoding it.'''
    if isinstance(data, str):
        # Python 2 byte string, or text that was searched as text.
        out(data)
        return
    raw = getattr(sys.stdout, "buffer", None)
    if raw is None:
        # Text-only stream: latin-1 maps every byte to one character.
        out(data.decode("latin-1"))
    else:
        # Flush pending text first so the raw bytes land in order.
        sys.stdout.flush()
        raw.write(data)


def ParseCommandLine():
    '''Parse sys.argv, set the module-level option flags and return the
    positional arguments (the pattern followed by any file names).

    Prints the manual and exits when no arguments are given, when -h is
    present or when the pattern is missing.  An unknown option is reported
    on stderr and the program exits with status 1.
    '''
    global plain_text, ignore_case, just_show_file_matches
    global hex_to_binary, show_match
    if len(sys.argv) < 2:
        Usage()
    try:
        optlist, args = getopt.getopt(sys.argv[1:], "fhilos")
    except getopt.GetoptError as err:
        Error(err.msg)
    for name, _ in optlist:
        if name == "-f":
            plain_text = True
        elif name == "-h":
            Usage()
        elif name == "-i":
            ignore_case = True
        elif name == "-l":
            just_show_file_matches = True
        elif name == "-o":
            hex_to_binary = True
        elif name == "-s":
            show_match = True
    if len(args) < 1:
        Usage()
    return args


def PrintOffsets(string, file, offsets, match_objects):
    '''Print one line per offset: decimal, hex and percentage of the data.

    string is the searched data (its length is the size used for the
    percentage), file is the name printed as a header (no header and no
    indentation when it is empty), offsets is the list of match offsets
    and match_objects is the parallel list of regex match objects (empty
    for plain text searches).  Nothing is printed when offsets is empty.
    With the -s option the matched data follows each regex offset.
    '''
    if not offsets:
        return
    indent = ""
    file_size = len(string)
    if file:
        s = " [%d (0x%x) bytes]" % (file_size, file_size)
        out(file + s + nl)
        indent = " "
    # Plain text searches carry no match objects; pad so one loop serves both.
    matches = match_objects if match_objects else [None] * len(offsets)
    for offset, mo in zip(offsets, matches):
        out(indent)
        out("%12d " % offset)
        s = "0x%x" % offset
        out(" %12s " % s)
        out(" %6.2f%% " % (100*(offset + 1.)/file_size))
        if show_match and mo is not None:
            _write_bytes(string[mo.start():mo.end()])
        out(nl)


def ProcessString(pattern, string):
    '''If there are matches, return the offsets (as decimal numbers) in a
    list.  An empty list means no matches.

    Returns the tuple (offsets, match_objects).  match_objects holds one
    regex match object per offset and stays empty for plain text searches.
    pattern and string must be of the same type (both bytes or both text).
    Plain text matches may overlap; regex matches do not.  With the -l
    option the search stops after the first match.
    '''
    offsets = []
    match_objects = []
    size = len(string)
    start = 0
    if plain_text:
        while start < size:
            offset = string.find(pattern, start)
            if offset == -1:
                break
            offsets.append(offset)
            if just_show_file_matches:
                break
            start = offset + 1
        return offsets, match_objects
    reg = re.compile(pattern, re.I if ignore_case else 0)
    while start < size:
        mo = reg.search(string, start)
        if mo is None:
            break
        offsets.append(mo.start())
        match_objects.append(mo)
        if just_show_file_matches:
            break
        # A zero-width match ends where it starts; step one position past it
        # so the search always makes progress.
        start = mo.end() if mo.end() > mo.start() else mo.end() + 1
    return offsets, match_objects


def ConvertToBinary(pattern):
    '''Convert a string of hex digit pairs such as "6865 6c 6c 6f" into
    the corresponding byte string.

    Space characters are removed and case is ignored.  Exits through
    Error() when the pattern is empty, has an odd number of digits or
    contains a character that is not a hex digit.
    '''
    # Strip the spaces first: the pairing check applies to the digits only.
    digits = pattern.replace(" ", "").lower()
    if not digits:
        Error("Empty hex pattern not allowed")
    if len(digits) % 2 != 0:
        Error("Hex pattern must contain an even number of hex digits")
    for ch in digits:
        if ch not in _HEX_DIGITS:
            Error("'%s' is not a valid hex digit" % ch)
    values = [int(digits[i:i + 2], 16) for i in range(0, len(digits), 2)]
    return bytes(bytearray(values))


def main():
    '''Search stdin, or each file named on the command line, for the
    pattern and print the results.
    '''
    global plain_text
    args = ParseCommandLine()
    pattern = args[0]
    if hex_to_binary:
        pattern = ConvertToBinary(pattern)
        # A converted hex pattern is raw bytes, not a regular expression.
        plain_text = True
    else:
        pattern = _as_bytes(pattern)
    if len(args) == 1:
        # Read stdin as bytes where a binary layer exists (Python 3).
        data = getattr(sys.stdin, "buffer", sys.stdin).read()
        offsets, match_objects = ProcessString(pattern, data)
        PrintOffsets(data, "stdin", offsets, match_objects)
        return
    for path in args[1:]:
        with open(path, "rb") as stream:
            data = stream.read()
        offsets, match_objects = ProcessString(pattern, data)
        if just_show_file_matches:
            if offsets:
                out(path + nl)
        else:
            PrintOffsets(data, path, offsets, match_objects)


if __name__ == "__main__":
    main()