c#-SimHash匹配相似-算法

更新时间：2022-09-16 18:16:28

使用场景：Google 的 simhash 算法

//通过大量测试，simhash用于比较大文本，比如500字以上效果都还蛮好，距离小于3的基本都是相似，误判率也比较低。
 
//从我的经验，如果我们假定N是每个块的大小，M是重叠的字符的数目，N = 4和M = 3是***的选择

public class SimHashAnalyser : IAnalyser
{
 
    private const int HashSize = 32;
 
    public float GetLikenessValue(string needle, string haystack)

    {

        var needleSimHash = this.DoCalculateSimHash(needle);

        var hayStackSimHash = this.DoCalculateSimHash(haystack);

        return (HashSize - GetHammingDistance(needleSimHash, hayStackSimHash)) / (float)HashSize;

    }
 
    private static IEnumerable<int> DoHashTokens(IEnumerable<string> tokens)

    {

        var hashedTokens = new List<int>();

        foreach (string token in tokens)

        {

            hashedTokens.Add(token.GetHashCode());

        }

        return hashedTokens;

    }
 
    private static int GetHammingDistance(int firstValue, int secondValue)

    {

        var hammingBits = firstValue ^ secondValue;

        var hammingValue = 0;

        for (int i = 0; i < 32; i++)

        {

            if (IsBitSet(hammingBits, i))

            {

                hammingValue += 1;

            }

        }

        return hammingValue;

    }
 
    private static bool IsBitSet(int b, int pos)

    {

        return (b & (1 << pos)) != 0;

    }
 
    private int DoCalculateSimHash(string input)

    {

        ITokeniser tokeniser = new OverlappingStringTokeniser(4, 3);

        var hashedtokens = DoHashTokens(tokeniser.Tokenise(input));

        var vector = new int[HashSize];

        for (var i = 0; i < HashSize; i++)

        {

            vector[i] = 0;

        }
 
        foreach (var value in hashedtokens)

        {

            for (var j = 0; j < HashSize; j++)

            {

                if (IsBitSet(value, j))

                {

                    vector[j] += 1;

                }

                else

                {

                    vector[j] -= 1;

                }

            }

        }
 
        var fingerprint = 0;

        for (var i = 0; i < HashSize; i++)

        {

            if (vector[i] > 0)

            {

                fingerprint += 1 << i;

            }

        }

        return fingerprint;

    }
 
}
 
public interface IAnalyser
{

    float GetLikenessValue(string needle, string haystack);
}
 
public interface ITokeniser
{

    IEnumerable<string> Tokenise(string input);
}
 
public class FixedSizeStringTokeniser : ITokeniser
{

    private readonly ushort tokensize = 5;

    public FixedSizeStringTokeniser(ushort tokenSize)

    {

        if (tokenSize < 2 || tokenSize > 127)

        {

            throw new ArgumentException("Token 不能超出范围");

        }

        this.tokensize = tokenSize;

    }
 
    public IEnumerable<string> Tokenise(string input)

    {

        var chunks = new List<string>();

        int offset = 0;

        while (offset < input.Length)

        {

            chunks.Add(new string(input.Skip(offset).Take(this.tokensize).ToArray()));

            offset += this.tokensize;

        }

        return chunks;

    }
 
}
 
public class OverlappingStringTokeniser : ITokeniser
{

    private readonly ushort chunkSize = 4;

    private readonly ushort overlapSize = 3;
 
    public OverlappingStringTokeniser(ushort chunkSize, ushort overlapSize)

    {

        if (chunkSize <= overlapSize)

        {

            throw new ArgumentException("Chunck 必须大于 overlap");

        }

        this.overlapSize = overlapSize;

        this.chunkSize = chunkSize;

    }
 
    public IEnumerable<string> Tokenise(string input)

    {

        var result = new List<string>();

        int position = 0;

        while (position < input.Length - this.chunkSize)

        {

            result.Add(input.Substring(position, this.chunkSize));

            position += this.chunkSize - this.overlapSize;

        }

        return result;

    }
 
}

使用：

const string HayStack = "中国***………………";

const string Needle = "中国*** 2013………………";
 
IAnalyser analyser = new SimHashAnalyser();

var likeness = analyser.GetLikenessValue(Needle, HayStack);
 
Console.Clear();

Console.WriteLine("Likeness: {0}%", likeness * 100);
Console.ReadKey();

SimHash for c#

本文转自曾祥展博客园博客，原文链接：http://www.cnblogs.com/zengxiangzhan/p/3311114.html，如需转载请自行联系原作者

上一篇 : ：安装sql2012 需要安装net3.5 没有的话安装不成功下一篇 : Visual C++ 时尚编程百例019(串行化)

c#-SimHash匹配相似-算法

相关阅读

推荐文章