Problem
This code guesses a file's type by searching its content for known signatures. It is intended to be used both from the command line (scanning entire media for mismatched extensions) and from code, to determine and validate the type of uploaded files which will be stored in a database (and eventually downloaded by someone else and possibly viewed).
One simple usage case:
// Inspect every file in the documents folder and print the name of each one
// whose extension does not match the content type that was detected.
var inspector = new Inspector();
inspector.Sources.Add(@"c:\users\adriano\documents\*.*");
// Inspect is a method, so it must be invoked before the results can be filtered.
var mismatches = inspector.Inspect().Where(x => !x.ExtensionMatchesContent);
foreach (var mismatch in mismatches)
    Console.WriteLine(mismatch.FileName);
For each file I have one or more InspectionResult
(in case it matches multiple types, for any reason):
/// <summary>
/// Result of matching a file against a known file type: the file name, a description
/// of the detected type, how reliable the guess is and whether the file's extension
/// agrees with the detected content.
/// </summary>
public sealed class InspectionResult
{
    /// <summary>
    /// Reliability of a guess. Members are declared from most to least reliable.
    /// </summary>
    public enum GuessQuality
    {
        VeryGood,
        Good,
        Low,
        VeryLow,
    }

    private string _mimeType;

    /// <summary>
    /// Creates a result for a file whose content matches a type with several valid extensions.
    /// </summary>
    /// <param name="fileName">Name of the inspected file; <c>null</c> is stored as an empty string.</param>
    /// <param name="description">Description of the detected type; <c>null</c> is stored as an empty string.</param>
    /// <param name="expectedExtensions">Extensions (including the leading dot) valid for the detected type.</param>
    /// <param name="quality">Reliability of the guess.</param>
    /// <exception cref="ArgumentNullException"><paramref name="expectedExtensions"/> is <c>null</c>.</exception>
    public InspectionResult(string fileName, string description, string[] expectedExtensions,
        GuessQuality quality = GuessQuality.VeryGood)
    {
        if (expectedExtensions == null)
            throw new ArgumentNullException(nameof(expectedExtensions));

        FileName = fileName ?? "";
        Description = description ?? "";
        Quality = quality;

        // Extensions are identifiers, not linguistic text: an ordinal comparison keeps the
        // result independent of the current culture (for example "I"/"i" under tr-TR).
        string fileExtension = Path.GetExtension(fileName);
        ExtensionMatchesContent = expectedExtensions.Any(x => String.Equals(x, fileExtension,
            StringComparison.OrdinalIgnoreCase));
    }

    /// <summary>
    /// Creates a result for a file whose content matches a type with a single valid extension.
    /// </summary>
    public InspectionResult(string fileName, string description, string expectedExtension,
        GuessQuality quality = GuessQuality.VeryGood)
        : this(fileName, description, new string[] { expectedExtension }, quality)
    {
    }

    /// <summary>Name of the inspected file (never <c>null</c>).</summary>
    public string FileName { get; }

    /// <summary>Description of the detected file type (never <c>null</c>).</summary>
    public string Description { get; }

    /// <summary>
    /// MIME type of the file. When no value has been assigned it is resolved from
    /// <see cref="FileName"/> on first access and then cached.
    /// </summary>
    public string MimeType
    {
        get { return _mimeType ?? (_mimeType = ResolveMimeTypeByExtension()); }
        set { _mimeType = value; }
    }

    /// <summary>Reliability of this guess.</summary>
    public GuessQuality Quality { get; }

    /// <summary>
    /// <c>true</c> when the extension of <see cref="FileName"/> is one of the extensions
    /// expected for the detected type (case-insensitive).
    /// </summary>
    public bool ExtensionMatchesContent { get; }

    // Falls back to the extension-based mapping; an empty file name yields an empty MIME type.
    private string ResolveMimeTypeByExtension()
    {
        if (String.IsNullOrWhiteSpace(FileName))
            return "";

        return MimeMapping.GetMimeMapping(FileName);
    }
}
Files to inspect are represented by classes derived from DataSource; currently the only supported one is FileDataSource. DataSourceCollection is a collection of them with a helper method to add multiple files using standard wildcards:
/// <summary>
/// Base class for an item whose leading bytes can be handed to the sleuths for inspection.
/// </summary>
public abstract class DataSource
{
    /// <summary>
    /// Returns the header of this source as a byte array.
    /// </summary>
    /// <param name="size">Number of header bytes requested.</param>
    public abstract byte[] GetHeader(int size);

    /// <summary>
    /// File name associated with this source; only derived classes can assign it.
    /// </summary>
    public string FileName { get; protected set; }
}
/// <summary>
/// Collection of data sources with a helper to add files by path, optionally using
/// the <c>*</c> and <c>?</c> wildcards in the file name part.
/// </summary>
public sealed class DataSourceCollection : Collection<DataSource>
{
    // Shared so that no array is allocated on every wildcard check.
    private static readonly char[] Wildcards = { '*', '?' };

    /// <summary>
    /// Adds a single file, or every file matching a wildcard pattern, to the collection.
    /// </summary>
    /// <param name="path">
    /// Path of the file to add. The file name part may contain wildcards, in which case
    /// all matching files of that directory are added.
    /// </param>
    /// <exception cref="ArgumentException"><paramref name="path"/> is null, empty or white space.</exception>
    public void Add(string path)
    {
        if (String.IsNullOrWhiteSpace(path))
            throw new ArgumentException("Invalid file path.", nameof(path));

        string fileName = Path.GetFileName(path);
        if (!ContainsAnyWildcard(fileName))
        {
            Add(new FileDataSource(path));
            return;
        }

        // A bare pattern such as "*.txt" has no directory part; Directory.GetFiles rejects
        // an empty path, so search the current directory in that case.
        string directory = Path.GetDirectoryName(path);
        if (String.IsNullOrEmpty(directory))
            directory = ".";

        foreach (var filePath in Directory.GetFiles(directory, fileName))
            Add(new FileDataSource(filePath));
    }

    private static bool ContainsAnyWildcard(string fileName)
        => fileName.IndexOfAny(Wildcards) != -1;
}
/// <summary>
/// Data source backed by a file on disk.
/// </summary>
public sealed class FileDataSource : DataSource
{
    private readonly string _path;

    /// <summary>
    /// Creates a data source for an existing file.
    /// </summary>
    /// <param name="path">Path of the file to inspect.</param>
    /// <exception cref="ArgumentException"><paramref name="path"/> is null, empty or white space.</exception>
    /// <exception cref="FileNotFoundException">The file does not exist.</exception>
    public FileDataSource(string path)
    {
        if (String.IsNullOrWhiteSpace(path))
            throw new ArgumentException("Invalid file path.", nameof(path));

        if (!File.Exists(path))
            throw new FileNotFoundException("Cannot find the file to inspect.", path);

        // The full path must be kept: GetHeader opens the file through it.
        _path = path;
        FileName = Path.GetFileName(path);
    }

    /// <summary>
    /// Reads the first <paramref name="size"/> bytes of the file.
    /// </summary>
    /// <param name="size">Number of bytes to read.</param>
    /// <returns>
    /// An array of exactly <paramref name="size"/> bytes; when the file is shorter the
    /// remaining bytes are left at zero.
    /// </returns>
    public override byte[] GetHeader(int size)
    {
        var buffer = new byte[size];

        using (var stream = File.OpenRead(_path))
        {
            // Stream.Read may return fewer bytes than requested, so keep reading
            // until the buffer is full or the end of the file is reached.
            int total = 0;
            while (total < size)
            {
                int read = stream.Read(buffer, total, size - total);
                if (read == 0)
                    break;

                total += read;
            }
        }

        return buffer;
    }
}
The abstract base class responsible for inspecting file content to determine its type is named Sleuth. You can handle multiple file types in one class or declare a separate class for each one:
/// <summary>
/// Base class for components that recognize a file type from the file's header bytes.
/// </summary>
public abstract class Sleuth
{
    /// <summary>
    /// Number of header bytes this sleuth asks for; assigned by derived classes.
    /// </summary>
    public int RequiredHeaderSize { get; protected set; }

    /// <summary>
    /// Examines the header of a file and produces an inspection result for it.
    /// </summary>
    /// <param name="fileName">Name of the file being inspected.</param>
    /// <param name="header">Leading bytes of the file.</param>
    public abstract InspectionResult Inspect(string fileName, byte[] header);

    /// <summary>
    /// Tells whether <paramref name="data"/> begins with the bytes of <paramref name="subset"/>.
    /// Returns <c>false</c> when either array is <c>null</c> or <paramref name="data"/> is
    /// shorter than <paramref name="subset"/>.
    /// </summary>
    protected static bool StartsWith(byte[] data, byte[] subset)
    {
        bool comparable = data != null && subset != null && data.Length >= subset.Length;
        if (!comparable)
            return false;

        // Advance while the bytes agree; reaching the end of subset means a full match.
        int index = 0;
        while (index < subset.Length && data[index] == subset[index])
            index++;

        return index == subset.Length;
    }
}
With its collection:
/// <summary>
/// Collection of sleuths with a helper that discovers and instantiates them from assemblies.
/// </summary>
public sealed class SleuthCollection : Collection<Sleuth>
{
    /// <summary>
    /// Creates one instance of every instantiable <see cref="Sleuth"/> type found in the
    /// given assemblies and adds it to the collection.
    /// </summary>
    /// <param name="assemblies">Assemblies to scan.</param>
    /// <exception cref="ArgumentNullException"><paramref name="assemblies"/> is <c>null</c>.</exception>
    /// <exception cref="ArgumentException">One of the assemblies is <c>null</c>.</exception>
    public void AddFromAssembly(params Assembly[] assemblies)
    {
        if (assemblies == null)
            throw new ArgumentNullException(nameof(assemblies));

        if (assemblies.Any(x => x == null))
            throw new ArgumentException("Cannot specify a null assembly reference.", nameof(assemblies));

        // Create every instance before adding any, so that a failing constructor
        // does not leave the collection partially populated.
        var discovered = assemblies.SelectMany(x => FindSleuths(x)).ToList();
        foreach (var sleuth in discovered)
            Add(sleuth);
    }

    private static IEnumerable<Sleuth> FindSleuths(Assembly assembly)
    {
        return assembly.GetTypes()
            .Where(IsInstantiableSleuth)
            .Select(x => (Sleuth)Activator.CreateInstance(x));
    }

    // A type qualifies when it derives from Sleuth, is concrete, is not an open generic
    // (Activator.CreateInstance cannot build those) and has a public parameterless constructor.
    private static bool IsInstantiableSleuth(Type type)
    {
        if (!typeof(Sleuth).IsAssignableFrom(type))
            return false;

        return !type.IsAbstract
            && !type.ContainsGenericParameters
            && type.GetConstructor(Type.EmptyTypes) != null;
    }
}
Users will create an instance of the Inspector
class, add data sources and collect results:
/// <summary>
/// Entry point: holds the data sources to inspect and the sleuths used to inspect them.
/// </summary>
public sealed class Inspector
{
    public Inspector()
    {
        Sources = new DataSourceCollection();
        Sleuths = new SleuthCollection();
    }

    /// <summary>Data sources that will be inspected.</summary>
    public DataSourceCollection Sources { get; }

    /// <summary>
    /// Sleuths used for the inspection. When empty, <see cref="Inspect"/> fills it with
    /// the sleuths found in the assembly that contains <see cref="Inspector"/>.
    /// </summary>
    public SleuthCollection Sleuths { get; }

    /// <summary>
    /// Inspects every source and returns the best guess for each one; sources that no
    /// sleuth recognizes are omitted.
    /// </summary>
    /// <returns>
    /// A lazily evaluated sequence: the sources are read while the result is enumerated.
    /// </returns>
    public IEnumerable<InspectionResult> Inspect()
    {
        if (Sleuths.Count == 0)
            Sleuths.AddFromAssembly(typeof(Inspector).Assembly);

        return Sources
            .Select(x => GuessFileType(x))
            .Where(x => x != null);
    }

    // Returns the first VeryGood guess, otherwise the best of the remaining guesses,
    // or null when no sleuth recognizes the source.
    private InspectionResult GuessFileType(DataSource source)
    {
        // Max() throws on an empty sequence, which happens when no sleuth could be
        // discovered; fall back to the minimum header size instead of failing.
        int requiredSize = Sleuths.Count == 0 ? 0 : Sleuths.Max(x => x.RequiredHeaderSize);
        int headerSize = Math.Max(1, requiredSize);

        var header = source.GetHeader(headerSize);
        var guesses = new List<InspectionResult>();

        foreach (var sleuth in Sleuths)
        {
            var guess = sleuth.Inspect(source.FileName, header);
            if (guess == null)
                continue;

            if (guess.Quality == InspectionResult.GuessQuality.VeryGood)
                return guess;

            guesses.Add(guess);
        }

        // GuessQuality is declared from best to worst, so ascending order puts the best first.
        return guesses
            .OrderBy(x => x.Quality)
            .FirstOrDefault();
    }
}
Each sleuth is as simple as this:
/// <summary>
/// Recognizes GIF images by their "GIF87a" / "GIF89a" header signature.
/// </summary>
public sealed class GifSleuth : Sleuth
{
    private const string FileExtension = ".gif";

    // ASCII "GIF87a" and "GIF89a". The values are hexadecimal: written as decimal
    // literals (47, 49, 46, ...) they would never match a real GIF header.
    private static readonly byte[] Gif87ASignature = { 0x47, 0x49, 0x46, 0x38, 0x37, 0x61 };
    private static readonly byte[] Gif89ASignature = { 0x47, 0x49, 0x46, 0x38, 0x39, 0x61 };

    public GifSleuth()
    {
        // Without this the inspector may hand over a header shorter than the signature,
        // in which case StartsWith can never succeed.
        RequiredHeaderSize = Gif87ASignature.Length;
    }

    /// <summary>
    /// Returns a GIF inspection result when <paramref name="header"/> starts with one of
    /// the GIF signatures; otherwise <c>null</c>.
    /// </summary>
    public override InspectionResult Inspect(string fileName, byte[] header)
    {
        bool isGif = StartsWith(header, Gif87ASignature) || StartsWith(header, Gif89ASignature);
        if (!isGif)
            return null;

        return new InspectionResult(fileName, Resources.GifDescription, FileExtension)
        {
            MimeType = MediaTypeNames.Image.Gif
        };
    }
}
Of course this code is pretty repetitive, so it can be shortened with some helper methods for the most common cases (currently only checking for a header signature, without support for multiple matches). I’m not sure about this interface/usage, but here it is:
/// <summary>
/// Base class for sleuths that recognize several file types, each identified by a
/// signature at the start of the header.
/// </summary>
public abstract class MultipleSleuth : Sleuth
{
    /// <summary>Describes one known file type and the signature that identifies it.</summary>
    protected sealed class FileType
    {
        public byte[] Signature { get; set; }
        public string Description { get; set; }
        public string ExpectedExtension { get; set; }
    }

    /// <summary>Returns the file types this sleuth can recognize.</summary>
    protected abstract IEnumerable<FileType> GetKnownFileTypes();

    /// <summary>
    /// Returns a result for the first known file type whose signature the header starts
    /// with, or <c>null</c> when none matches.
    /// </summary>
    public override InspectionResult Inspect(string fileName, byte[] header)
    {
        FileType match = GetKnownFileTypes()
            .FirstOrDefault(x => StartsWith(header, x.Signature));

        if (match == null)
            return null;

        return new InspectionResult(fileName, match.Description, match.ExpectedExtension);
    }
}
Solution
FileName = fileName ?? "";
This doesn’t look right. Does an empty file name really make sense? I think you should either prevent it from being null or empty, or add another constructor that doesn’t require it; the constructor that does take a file name should not allow null.
public void AddFromAssembly(params Assembly[] assemblies)
This method shouldn’t be part of the SleuthCollection
. It should be implemented in a factory. The collection should just store the items. Knowing how to load an assembly and even create instances is another responsibility.
In fact, you don’t need the SleuthCollection
at all: apart from being a collection, it already does everything a factory would do.
There are two possibilities: either a dedicated factory class or a factory method on the Sleuth
type itself. You could use it like:
Sleuth.FromAssembly(...) // returns IEnumerable<Sleuth>
But I find a dedicated factory would be better.
public sealed class GifSleuth : Sleuth
public abstract class MultipleSleuth : Sleuth
Nested public classes should be avoided. If they don’t need access to private members of the parent type, there is no reason for them to be inside another class. A namespace like Sleuths
would be more appropriate.
public sealed class DataSourceCollection : Collection<DataSource>
This type actually stores DataSource
s, but internally it depends on a hard-coded FileDataSource
. This isn’t good. If you want to have a collection of file data sources, then this should be the type of the collection. Also, in this case a List<DataSource>
would be enough and the FileDataSource
should have a factory method FileDataSource.From(string path)
. The DataSourceCollection
shouldn’t know how to create a FileDataSource
or even how to check for wildcards. That is the responsibility of the FileDataSource
.
public enum GuessQuality
A public enum nested in another class, no no no 😉
return ""
This ""
is dangerous. Use string.Empty
in such cases.
if (_mimeType == null)
Since _mimeType
is a string
you should use string.IsNullOrEmpty
. You might be sure now that it is either null or a valid string but this is a dangerous assumption.
I think it’s very strange that the InspectionResult
is only 99% immutable. The MimeType
shouldn’t have a setter; you could initialize it inside the constructor like the other properties.