#!/bin/awk -f
# AWK program abridge.awk [ncmax=value]
#
# Given a newline-separated list of words, abridge them by determining
# the longest matching left- and right-substrings, and replacing the
# differing parts with a comma-delimited list in braces {}.
# If no compression is possible, the lines are passed through unaltered.
# The maximum number of characters per output line may be set with the
# "ncmax=value" option.
#
# The common left/right substrings are compared literally, character by
# character, so regular-expression metacharacters in the input (".", "+",
# "[", ...) are treated as ordinary text.
#
# If any input line is longer than ncmax characters, an error message is
# written to standard error, nothing is abridged, and the exit status is 1.
#
# EXAMPLES
#
# % echo "a/1/b\na/2/b\na/3/b" | abridge.awk
# a/{1,2,3}/b
#
# % echo "a/1\na/2\na/3" | abridge.awk
# a/{1,2,3}
#
# % echo "1/b\n2/b\n3/b" | abridge.awk
# {1,2,3}/b
#
# % echo "1\n2\n3" | abridge.awk
# 1
# 2
# 3
#
# % echo "a/12345/b\na/22345/b\na/32345/b" | abridge.awk ncmax=14
# a/{1,2}2345/b
# a/{3}2345/b
#
# Author: Andrew Wittenberg
# Version dated 22nov2003

# Return the length of the longest common prefix of strings a and b,
# never exceeding limit.  (k is a local variable.)
function common_prefix_len(a, b, limit,    k) {
    # Cap the limit at both string lengths so that two empty substr()
    # results past the end of the strings are never compared as "equal".
    if (limit > length(a)) limit = length(a)
    if (limit > length(b)) limit = length(b)
    k = 0
    while (k < limit && substr(a, k + 1, 1) == substr(b, k + 1, 1))
        k++
    return k
}

# Return the length of the longest common suffix of strings a and b,
# never exceeding limit.  (la, lb and k are local variables.)
function common_suffix_len(a, b, limit,    la, lb, k) {
    la = length(a)
    lb = length(b)
    if (limit > la) limit = la
    if (limit > lb) limit = lb
    k = 0
    while (k < limit && substr(a, la - k, 1) == substr(b, lb - k, 1))
        k++
    return k
}

# Set the default maximum number of characters per line.  This is done on
# the first record rather than in BEGIN because an "ncmax=value" operand on
# the command line is only assigned after BEGIN has run.
NR == 1 && ncmax <= 0 { ncmax = 1e6 }

# Save all lines, and ensure they do not exceed ncmax characters.
{
    line[NR] = $0
    if (length($0) > ncmax) {
        print "ERROR in abridge.awk, pathname exceeds", ncmax, "characters:\n" $0 | "cat 1>&2"
        err = 1
        exit    # control passes to END, which turns err into exit status 1
    }
}

# Save first line as a template line, and initialize left/right lengths.
#   ts = template string, n = its length,
#   nl = length of the left match so far, nr = length of the right match.
NR == 1 {
    ts = $0
    n = length($0)
    nl = n
    nr = n
}

# Process remaining lines: shrink nl and nr to what this line still shares
# with the template.
NR > 1 {
    # Longest matching left substring for this line.
    nl = common_prefix_len(ts, $0, nl)

    # Longest matching right substring for this line.  Only the text that
    # follows the left match is considered, in both the template and the
    # current line, so the left and right matches can never overlap.
    nr = common_suffix_len(substr(ts, nl + 1), substr($0, nl + 1), nr)
}

# Print out the results.
END {
    if (err > 0)
        exit 1    # an over-long line was detected

    if (NR <= 1 || (nl <= 0 && nr <= 0)) {
        # No compression possible -- just pass lines through unaltered.
        for (i = 1; i <= NR; i++)
            print line[i]
    } else {
        # If the list is too long to fit into a single Unix word, then
        # split into several words (on separate lines).
        # NOTE(review): the first word on an output line is never checked
        # against ncmax, so a line can exceed ncmax by the two brace
        # characters when an input word is already close to the limit.
        ls = substr(ts, 1, nl)         # common left substring
        rs = substr(ts, n - nr + 1)    # common right substring
        nc = 0                         # characters printed so far on this line

        for (i = 1; i <= NR; i++) {    # loop through the input paths
            w = substr(line[i], nl + 1, length(line[i]) - nr - nl)  # unique part
            nw = length(w)

            if (nc == 0) {
                # First word on this line: precede word by left-match & brace.
                printf("%s{%s", ls, w)
                nc = nl + 1 + nw
            } else if (nc + 1 + nw + 1 + nr <= ncmax) {
                # Room for the comma, this word, the closing brace and the
                # right-match: append to the current line.
                printf(",%s", w)
                nc += 1 + nw
            } else {
                # Terminate the current line and start a new one.
                printf("}%s\n%s{%s", rs, ls, w)
                nc = nl + 1 + nw
            }
        }

        # Close the final brace group.
        print "}" rs
    }
}