#!/usr/bin/env ruby
=begin
report.rb
this script is used to validate and report statistics on the blacklist entries
it currently detects syntax errors, missing tags, unspecified tags, missing descriptions,
duplicate entries for a single package (partitioning them as identical or differing),
and 'neglected_entries' which are those with no tag, no description, and no replacement
it optionally creates a JSON file with the results
that can be the input to the post_fsd_wiki.phantomjs script
=end
  11. # DEBUG begin
  12. DEBUG = false
  13. require 'byebug' if DEBUG
  14. DEBUG_FILTER_NAMES = [ 'vapoursynth-plugin-fluxsmooth' ]
  15. def IS_DEBUG_FILTER_NAME name ; DEBUG_FILTER_NAMES.include? name ; end ;
  16. # DEBUG end
  17. require 'json'
  18. require 'set'
  19. # entry syntax => original-package:[libre-replacement]:[ref]:[id]:short-description
  20. BLACKLIST_FILES = [ 'blacklist.txt' ]
  21. # BLACKLIST_FILES = [ 'blacklist-testdata.txt' ]
  22. VALID_ENTRIES_REGEX = /^[^:\[\]#]*:[^:\[\]]*:(sv|debian|parabola|fsf|fedora)?:[^:\[\]]*:\w*([^\d:]+:.*|\[[^:]+:.*|[^:]*)$/
  23. BLACKLIST_TAGS_REGEX = /^\[([^\]]*)\] *(.*)/
  24. RAW_ENTRY_KEY = :raw_blacklist_entry
  25. PACKAGE_NAME_KEY = :original_package # per blacklist SYNTAX
  26. REPLACEMENT_KEY = :libre_replacement # per blacklist SYNTAX
  27. REFERENCE_KEY = :ref # per blacklist SYNTAX
  28. ENTRY_ID_KEY = :id # per blacklist SYNTAX
  29. DESCRIPTION_KEY = :short_description # per blacklist SYNTAX
  30. BLACKLIST_TAGS_KEY = :blacklist_tags
  31. NONFREE_TAG = 'nonfree'
  32. SEMIFREE_TAG = 'semifree'
  33. USES_NONFREE_TAG = 'uses-nonfree'
  34. BRANDING_TAG = 'branding'
  35. TECHNICAL_TAG = 'technical'
  36. HAS_REPLACEMENT_TAG = 'FIXME:package'
  37. NEEDS_DESC_TAG = 'FIXME:description'
  38. ACCEPTABLE_TAGS = [ NONFREE_TAG , SEMIFREE_TAG , USES_NONFREE_TAG , BRANDING_TAG ,
  39. TECHNICAL_TAG , HAS_REPLACEMENT_TAG , NEEDS_DESC_TAG ]
  40. DO_PRINT_STATS = true
  41. DO_PRINT_INCOMPLETE = true
  42. DO_PRINT_DUPLICATE = false
  43. REPORT_SEPARATOR = "------------------------------------------------------------\n"
  44. OUTPUT_JSON_FILE = 'blacklist-data.json'
  45. entries_invalid = []
  46. entries = []
  47. entry_freqs = {}
  48. entries_no_desc = []
  49. entries_no_tags = []
  50. entries_unspecified_tags = []
  51. unspecified_tags = Set[]
  52. duplicate_names = nil # deferred
  53. duplicate_identical_entries = {}
  54. duplicate_differing_entries = {}
  55. ## parse data ##
  56. BLACKLIST_FILES.each do | blacklist_filename |
  57. if DEBUG ; print "\nDEBUG: parsing #{blacklist_filename}\n" ; end ;
  58. (File.readlines blacklist_filename).each do | line |
  59. # DEBUG_FILTER_NAMES.each { | ea | debugger if line.start_with? ea }
  60. next if line.strip.empty? || (line.strip.start_with? '#')
  61. entries_invalid << line && next unless line.match VALID_ENTRIES_REGEX
  62. entries << (entry = {})
  63. tokens = (line.split ':')
  64. if DEBUG ; tokens.each_with_index { | token , i | print "DEBUG: tokens[#{i}]=#{token}\n" if IS_DEBUG_FILTER_NAME tokens[0] } ; end ;
  65. entry[RAW_ENTRY_KEY ] = line
  66. entry[PACKAGE_NAME_KEY ] = (tokens.shift ).gsub("\t" , '').strip
  67. entry[REPLACEMENT_KEY ] = (tokens.shift ).gsub("\t" , '').strip
  68. entry[REFERENCE_KEY ] = (tokens.shift ).gsub("\t" , '').strip
  69. entry[ENTRY_ID_KEY ] = (tokens.shift ).gsub("\t" , '').strip
  70. entry[DESCRIPTION_KEY ] = (tokens.join ':').gsub("\t" , '').strip
  71. entry[BLACKLIST_TAGS_KEY] = []
  72. # parse tags
  73. while (entry[DESCRIPTION_KEY].start_with? '[') && (entry[DESCRIPTION_KEY].include? ']')
  74. if DEBUG ; print "\n parsing tag for: #{entry[PACKAGE_NAME_KEY]}\n" ; print "desc IN=#{entry[DESCRIPTION_KEY]}\n" ; end ;
  75. # debugger if DEBUG && (DEBUG_FILTER_NAMES.include? entry[PACKAGE_NAME_KEY])
  76. entry[BLACKLIST_TAGS_KEY] << (entry[DESCRIPTION_KEY].gsub BLACKLIST_TAGS_REGEX , '\1')
  77. entry[DESCRIPTION_KEY ] = (entry[DESCRIPTION_KEY].gsub BLACKLIST_TAGS_REGEX , '\2')
  78. if DEBUG ; print "desc OUT=#{entry[DESCRIPTION_KEY]}\n" ; print "tags=#{entry[BLACKLIST_TAGS_KEY]}\n" ; sleep 0.2 ; end ;
  79. end
  80. if DEBUG ; print "\nno tag for: #{ entry[PACKAGE_NAME_KEY]}\n" if entry[BLACKLIST_TAGS_KEY].empty? ; end ;
  81. if DEBUG ; print "\nno desc for: #{entry[PACKAGE_NAME_KEY]}\n" if entry[DESCRIPTION_KEY ].empty? ; end ;
  82. end
  83. end
  84. ## process data ##
  85. entries.each do | entry |
  86. if DEBUG && (IS_DEBUG_FILTER_NAME entry[PACKAGE_NAME_KEY]) ; print "\n" ; entry.each_pair { | k , v | print "DEBUG: #{k}: #{v}\n" } ; end ;
  87. entry_name = entry[PACKAGE_NAME_KEY ]
  88. entry_desc = entry[DESCRIPTION_KEY ]
  89. entry_tags = entry[BLACKLIST_TAGS_KEY]
  90. entry_unspecified_tags = entry_tags - ACCEPTABLE_TAGS
  91. entry_freqs[entry_name] = (entry_freqs[entry_name] ||= 0) + 1
  92. entries_no_desc << entry if entry_desc .empty?
  93. entries_no_tags << entry if entry_tags .empty?
  94. entries_unspecified_tags << entry unless entry_unspecified_tags.empty?
  95. unspecified_tags.merge entry_unspecified_tags
  96. end
  97. duplicate_names = entry_freqs.keys.select { | name | entry_freqs[name] > 1 }
  98. incomplete_entries = entries_no_desc + entries_no_tags
  99. unreplaced_entries = entries.select { | entry | entry[REPLACEMENT_KEY].empty? }
  100. neglected_entries = incomplete_entries.map { | entry | entry[PACKAGE_NAME_KEY] }.to_set &
  101. unreplaced_entries.map { | entry | entry[PACKAGE_NAME_KEY] }.to_set
  102. duplicate_names.each do | duplicate_name |
  103. # next unless IS_DEBUG_FILTER_NAME duplicate_name ; p "duplicate_name=#{duplicate_name}" # DEBUG
  104. duplicate_entries = entries.select { | entry | entry[PACKAGE_NAME_KEY] == duplicate_name } \
  105. .map! { | entry | entry[RAW_ENTRY_KEY ] }
  106. unique_entries = duplicate_entries.uniq
  107. n_unique_entries = unique_entries.size
  108. unique_entries.each do | uniq_value |
  109. n_identical_entries = duplicate_entries.count { | dup_entry | dup_entry == uniq_value }
  110. duplicate_identical_entries[duplicate_name] = uniq_value + " (#{n_identical_entries} identical)" if n_identical_entries > 1
  111. end
  112. if n_unique_entries > 1
  113. duplicate_differing_entries[duplicate_name] = unique_entries
  114. end
  115. end
  116. ## report ##
  117. print REPORT_SEPARATOR
  118. print "entries found: #{ (entries + entries_invalid ).size }\n" if DO_PRINT_STATS
  119. print "entries valid: #{ (entries ).size }\n" if DO_PRINT_STATS
  120. print "entries_invalid: #{ (entries_invalid ).size }\n" if DO_PRINT_STATS
  121. print "entries lacking tags: #{ (entries_no_tags ).size }\n" if DO_PRINT_STATS
  122. print "entries lacking description: #{(entries_no_desc ).size }\n" if DO_PRINT_STATS
  123. print "unspecified tags: #{ (unspecified_tags ).size }\n" if DO_PRINT_STATS
  124. print "neglected entries: #{ (neglected_entries ).size }\n" if DO_PRINT_STATS
  125. print "duplicate_names: #{ (duplicate_names ).size }\n" if DO_PRINT_STATS
  126. print " identical entries: #{ (duplicate_identical_entries).size }\n" if DO_PRINT_STATS
  127. print " differing entries: #{ (duplicate_differing_entries).keys.size}\n" if DO_PRINT_STATS
  128. if DO_PRINT_INCOMPLETE
  129. { 'invalid entries' => entries_invalid ,
  130. 'entries lacking description' => entries_no_desc ,
  131. 'entries lacking tags' => entries_no_tags ,
  132. 'entries with unspecified tags' => entries_unspecified_tags ,
  133. 'unspecified tags' => unspecified_tags }.each_pair do | label , data |
  134. print REPORT_SEPARATOR + "#{label}:\n" unless data.empty?
  135. data.each { | entry | print " #{((entry.is_a? Hash) ? entry[RAW_ENTRY_KEY] : entry).strip}\n" }
  136. end
  137. end
  138. unless neglected_entries.empty? || ! DO_PRINT_INCOMPLETE
  139. print REPORT_SEPARATOR + "neglected entries:\n"
  140. neglected_entries.each { | entry_name | print " #{entry_name}\n" }
  141. end
  142. unless duplicate_names.empty? || ! DO_PRINT_DUPLICATE
  143. print REPORT_SEPARATOR + "duplicate entries:\n"
  144. duplicate_names.each do | duplicate_name |
  145. identical_entry = duplicate_identical_entries[duplicate_name]
  146. differing_entries = duplicate_differing_entries[duplicate_name]
  147. print "\n #{duplicate_name}:\n"
  148. print " identical entries:\n" unless identical_entry .nil?
  149. print " #{identical_entry}\n" unless identical_entry .nil?
  150. print " differing entries:\n" unless differing_entries.nil?
  151. differing_entries.each { | entry | print " #{entry}\n" } unless differing_entries.nil?
  152. end
  153. end
  154. ## sanity check ##
  155. should_quit = ! (entries_invalid.empty? && unspecified_tags.empty? && duplicate_names.empty?)
  156. (print "errors were found - JSON will not be generated\n" ; exit 1) if should_quit
  157. ## generate JSON ##
  158. IO.write OUTPUT_JSON_FILE , entries.to_json ; print "\nwrote: #{OUTPUT_JSON_FILE}\n\ndone\n" ;