更新时间:2022-09-16 18:16:28
使用场景:Google 的 simhash 算法
//通过大量测试,simhash用于比较大文本,比如500字以上效果都还蛮好,距离小于3的基本都是相似,误判率也比较低。 //从我的经验,如果我们假定N是每个块的大小,M是重叠的字符的数目,N = 4和M = 3是***的选择 |
public class SimHashAnalyser : IAnalyser
{ private const int HashSize = 32;
public float GetLikenessValue( string needle, string haystack)
{
var needleSimHash = this .DoCalculateSimHash(needle);
var hayStackSimHash = this .DoCalculateSimHash(haystack);
return (HashSize - GetHammingDistance(needleSimHash, hayStackSimHash)) / ( float )HashSize;
}
private static IEnumerable< int > DoHashTokens(IEnumerable< string > tokens)
{
var hashedTokens = new List< int >();
foreach ( string token in tokens)
{
hashedTokens.Add(token.GetHashCode());
}
return hashedTokens;
}
private static int GetHammingDistance( int firstValue, int secondValue)
{
var hammingBits = firstValue ^ secondValue;
var hammingValue = 0;
for ( int i = 0; i < 32; i++)
{
if (IsBitSet(hammingBits, i))
{
hammingValue += 1;
}
}
return hammingValue;
}
private static bool IsBitSet( int b, int pos)
{
return (b & (1 << pos)) != 0;
}
private int DoCalculateSimHash( string input)
{
ITokeniser tokeniser = new OverlappingStringTokeniser(4, 3);
var hashedtokens = DoHashTokens(tokeniser.Tokenise(input));
var vector = new int [HashSize];
for ( var i = 0; i < HashSize; i++)
{
vector[i] = 0;
}
foreach ( var value in hashedtokens)
{
for ( var j = 0; j < HashSize; j++)
{
if (IsBitSet(value, j))
{
vector[j] += 1;
}
else
{
vector[j] -= 1;
}
}
}
var fingerprint = 0;
for ( var i = 0; i < HashSize; i++)
{
if (vector[i] > 0)
{
fingerprint += 1 << i;
}
}
return fingerprint;
}
} public interface IAnalyser
{ float GetLikenessValue( string needle, string haystack);
} public interface ITokeniser
{ IEnumerable< string > Tokenise( string input);
} public class FixedSizeStringTokeniser : ITokeniser
{ private readonly ushort tokensize = 5;
public FixedSizeStringTokeniser( ushort tokenSize)
{
if (tokenSize < 2 || tokenSize > 127)
{
throw new ArgumentException( "Token 不能超出范围" );
}
this .tokensize = tokenSize;
}
public IEnumerable< string > Tokenise( string input)
{
var chunks = new List< string >();
int offset = 0;
while (offset < input.Length)
{
chunks.Add( new string (input.Skip(offset).Take( this .tokensize).ToArray()));
offset += this .tokensize;
}
return chunks;
}
} public class OverlappingStringTokeniser : ITokeniser
{ private readonly ushort chunkSize = 4;
private readonly ushort overlapSize = 3;
public OverlappingStringTokeniser( ushort chunkSize, ushort overlapSize)
{
if (chunkSize <= overlapSize)
{
throw new ArgumentException( "Chunck 必须大于 overlap" );
}
this .overlapSize = overlapSize;
this .chunkSize = chunkSize;
}
public IEnumerable< string > Tokenise( string input)
{
var result = new List< string >();
int position = 0;
while (position < input.Length - this .chunkSize)
{
result.Add(input.Substring(position, this .chunkSize));
position += this .chunkSize - this .overlapSize;
}
return result;
}
} |
使用:
const string HayStack = "中国***………………" ;
const string Needle = "中国*** 2013………………" ;
IAnalyser analyser = new SimHashAnalyser();
var likeness = analyser.GetLikenessValue(Needle, HayStack);
Console.Clear(); Console.WriteLine( "Likeness: {0}%" , likeness * 100);
Console.ReadKey(); |
本文转自曾祥展博客园博客,原文链接:http://www.cnblogs.com/zengxiangzhan/p/3311114.html,如需转载请自行联系原作者