Python 利用字符串相似性算法进行地址的模糊匹配
在改造和集成项目中,经常需要弄清楚不同厂家对同一个设备信号、地址、变量所起的不同名字,并整理成对照表。这种脏活累活需要比较智能的工具来提升效率,利用字符串相似性算法进行模糊匹配是一个不错的思路。那么,如何把人工总结的规律告诉字符串相似性算法呢?在应用字符串相似性算法之前,先按这些规律对字符串做几次替换就行了。
import re
import csv
import difflib
# Fuzzy matching with a string-similarity algorithm, to help find the
# different names that refer to the same thing: for every line of RtdbFile,
# find the closest line in OpcFile.
RtdbFile = r'rtdb_path.txt'
OpcFile = r'opc_path.txt'
# Read each file inside a `with` block so the handle is closed as soon as the
# lines are loaded, instead of being left open until garbage collection.
with open(RtdbFile) as _rtdb_fh:
    RtdbPathList = sorted(_rtdb_fh.readlines())
with open(OpcFile) as _opc_fh:
    OpcPathList = _opc_fh.readlines()
# Explicit, already-known substitutions, stored as (old, new) pairs.
# They are applied with str.replace in list order.
rules = [
    ('BGV1.MMSState1', 'Stand19'),
    ('BGV1.MMSState2', 'Stand20'),
    ('.FFB1.MMSState1', 'Stand21'),
    ('.FFB1.MMSState2', 'Stand22'),
    ('.FFB2.MMSState1', 'Stand23'),
    ('.FFB2.MMSState2', 'Stand24'),
    ('.FFB3.MMSState1', 'Stand25'),
    ('.FFB3.MMSState2', 'Stand26'),
    ('.FFB4.MMSState1', 'Stand27'),
    ('.FFB4.MMSState2', 'Stand28'),
    ('.TMB1', 'Stand29'),
    ('.TMB2', 'Stand30'),
    ('.TMB3', 'Stand31'),
    ('.TMB4', 'Stand32'),
    ('BlowerState', 'LCVB'),
]

# (old, new) pairs for the other side of the comparison: they expand
# abbreviations and drop the "mmsData." prefix.
reverse_rules = [
    ("Data.ST", "Data.Stand"),
    ("Data.SH", "Data.Shear"),
    ("mmsData.", ''),
    ("RT.", "RollerTable."),
]

# Fragments that carry no meaning for the comparison; each one is removed
# (replaced by the empty string), in list order.
junkLbda = [
    'Data',
    'WireLine.',
    'WireLineExit.',
    'PrefinishingMill.',
    ':',
    'WRM5.',
    'MMSState.',
    'MMSState1.',
    'MMSState2.',
    ' ',
]
# Keep only the OPC lines that contain 'mmsData' and '{' but not 'Comm',
# strip surrounding whitespace from the survivors and sort them.
OpcList = sorted(
    line.strip()
    for line in OpcPathList
    if 'mmsData' in line and 'Comm' not in line and '{' in line
)
# Strip surrounding whitespace (including the trailing newline) from every
# RTDB line; the list keeps the order it already has.
RtdbPathList = [line.strip() for line in RtdbPathList]
# Normalise every OPC path once, up front. The normalised form does not depend
# on the RTDB line being matched, so recomputing it inside the outer loop
# would repeat identical work for every RTDB line.
_lp_pattern = re.compile(r"Data\.LP(\d+)")
_opc_candidates = []
for opc_path in OpcList:
    # Rewrite "Data.LP<n>" as "Stand<n-1>.MMSStateLp"; the callable replacement
    # does the arithmetic on the captured number. Paths without "Data.LP" are
    # left untouched by the substitution, so no separate guard is needed.
    opc_key = _lp_pattern.sub(
        lambda m: f'Stand{int(m.group(1))-1}.MMSStateLp', opc_path)
    for old, new in reverse_rules:
        opc_key = opc_key.replace(old, new)
    _opc_candidates.append((opc_path, opc_key))

# For each RTDB path, print "<rtdb path>,<closest OPC path>,<similarity>".
for rtdb_path in RtdbPathList:
    # Normalise the RTDB path: apply the known substitutions, then drop the
    # fragments that carry no meaning.
    rtdb_key = rtdb_path
    for old, new in rules:
        rtdb_key = rtdb_key.replace(old, new)
    for junk in junkLbda:
        rtdb_key = rtdb_key.replace(junk, '')

    # Score only the normalised strings so every candidate is judged on the
    # same scale (a baseline computed from the raw strings is not comparable
    # and can hide the real best match). With no candidates at all, report an
    # empty match with score 0.0 instead of failing on OpcList[0].
    # max() keeps the first of several equally good candidates.
    sim, simLine = max(
        (
            (difflib.SequenceMatcher(None, rtdb_key, opc_key).ratio(), opc_path)
            for opc_path, opc_key in _opc_candidates
        ),
        key=lambda scored: scored[0],
        default=(0.0, ''),
    )
    # NOTE: matched OPC paths are not removed from the candidate list, so the
    # same OPC path may be reported for several RTDB lines.
    print(f'{rtdb_path},{simLine},{sim}')