# -*- coding: utf-8 -*-
"""
binaryornot.helpers
-------------------
Helper utilities used by BinaryOrNot.
"""
import chardet
import logging
logger = logging.getLogger(__name__)
def print_as_hex(s):
"""
Print a string
as hex bytes.
"""
print(
":".join(
"{0:x}".format(ord(c))
for c
in s))
def get_starting_chunk(filename, length=1024):
"""
:param filename: File to open
and get the first little chunk of.
:param length: Number of bytes to read, default 1024.
:returns: Starting chunk of bytes.
"""
# Ensure we open the file in binary mode
try:
with open(filename,
'rb')
as f:
chunk = f.read(length)
return chunk
except IOError
as e:
print(e)
_control_chars = b
'\n\r\t\f\b'
if bytes
is str:
# Python 2 means we need to invoke chr() explicitly
_printable_ascii = _control_chars + b
''.join(map(chr, range(32, 127)))
_printable_high_ascii = b
''.join(map(chr, range(127, 256)))
else:
# Python 3 means bytes accepts integer input directly
_printable_ascii = _control_chars + bytes(range(32, 127))
_printable_high_ascii = bytes(range(127, 256))
def is_binary_string(bytes_to_check):
"""
Uses a simplified version of the Perl detection algorithm,
based roughly on Eli Bendersky
's translation to Python:
http://eli.thegreenplace.net/2011/10/19/perls-guess-if-file-is-text-or-binary-implemented-in-python/
This
is biased slightly more
in favour of deeming files
as text
files than the Perl algorithm, since all ASCII compatible character
sets are accepted
as text,
not just utf-8.
:param bytes: A chunk of bytes to check.
:returns:
True if appears to be a binary, otherwise
False.
"""
# Empty files are considered text files
if not bytes_to_check:
return False
# Now check for a high percentage of ASCII control characters
# Binary if control chars are > 30% of the string
low_chars = bytes_to_check.translate(
None, _printable_ascii)
nontext_ratio1 = float(len(low_chars)) / float(len(bytes_to_check))
logger.debug(
'nontext_ratio1: %(nontext_ratio1)r', locals())
# and check for a low percentage of high ASCII characters:
# Binary if high ASCII chars are < 5% of the string
# From: https://en.wikipedia.org/wiki/UTF-8
# If the bytes are random, the chances of a byte with the high bit set
# starting a valid UTF-8 character is only 6.64%. The chances of finding 7
# of these without finding an invalid sequence is actually lower than the
# chance of the first three bytes randomly being the UTF-8 BOM.
high_chars = bytes_to_check.translate(
None, _printable_high_ascii)
nontext_ratio2 = float(len(high_chars)) / float(len(bytes_to_check))
logger.debug(
'nontext_ratio2: %(nontext_ratio2)r', locals())
is_likely_binary = (
(nontext_ratio1 > 0.3
and nontext_ratio2 < 0.05)
or
(nontext_ratio1 > 0.8
and nontext_ratio2 > 0.8)
)
logger.debug(
'is_likely_binary: %(is_likely_binary)r', locals())
# then check for binary for possible encoding detection with chardet
detected_encoding = chardet.detect(bytes_to_check)
logger.debug(
'detected_encoding: %(detected_encoding)r', locals())
# finally use all the check to decide binary or text
decodable_as_unicode =
False
if (detected_encoding[
'confidence'] > 0.9
and
detected_encoding[
'encoding'] !=
'ascii'):
try:
try:
bytes_to_check.decode(encoding=detected_encoding[
'encoding'])
except TypeError:
# happens only on Python 2.6
unicode(bytes_to_check, encoding=detected_encoding[
'encoding'])
# noqa
decodable_as_unicode =
True
logger.debug(
'success: decodable_as_unicode: '
'%(decodable_as_unicode)r', locals())
except LookupError:
logger.debug(
'failure: could not look up encoding %(encoding)s',
detected_encoding)
except UnicodeDecodeError:
logger.debug(
'failure: decodable_as_unicode: '
'%(decodable_as_unicode)r', locals())
logger.debug(
'failure: decodable_as_unicode: '
'%(decodable_as_unicode)r', locals())
if is_likely_binary:
if decodable_as_unicode:
return False
else:
return True
else:
if decodable_as_unicode:
return False
else:
if b
'\x00' in bytes_to_check
or b
'\xff' in bytes_to_check:
# Check for NULL bytes last
logger.debug(
'has nulls:' + repr(b
'\x00' in bytes_to_check))
return True
return False