How to parse huge XML files with Ruby and Nokogiri (without using too much RAM)
Parse huge XML files, without using too much RAM, with Ruby and Nokogiri and the following code:
require 'nokogiri'
# Public: BigXML helps you parse XML efficiently with minimal RAM usage, whether the file is 1GB, 2GB or 100GB.
#
# Examples:
# # Filter an XML file efficiently by selecting only users, groups and messages.
# File.open(ARGV[1], 'w') do |out_file|
# xml = BigXML.new(ARGV[0])
# xml.each_node do |node, path|
# # users
# if node.name == 'user' # or use the element's path: path == '/export/users/user'
# out_file << node.outer_xml
# # groups
# elsif node.name == 'group' # or use the element's path and content: path == '/export/groups/group' && node.outer_xml.match(/<private type="boolean">false/m)
# out_file << node.outer_xml
# # messages
# elsif node.name == 'message' # or use the element's path and content: path == '/export/messages/message' && node.outer_xml.match(/<private type="boolean">false/m)
# out_file << node.outer_xml
# end
# end
# end
#
class BigXML
  # Public: Initializes a parser.
  #
  # xml_file - The String path of the XML file you want to parse.
  #
  # Raises ArgumentError if xml_file is not a String.
  def initialize(xml_file)
    raise ArgumentError, "Please provide the path of the XML file, not a #{xml_file.class}" unless xml_file.is_a?(String)

    @xml_file = xml_file
  end

  # Public: Iterate over each element start tag in the XML document.
  #
  # The file is opened for the duration of the iteration only and is closed
  # again when the iteration finishes or the block raises.
  #
  # attributes_in_path - Default false. Setting this to true will include the
  #                      element's attributes (sorted by name) in its path
  #                      segment, e.g. /groups/@id=1 instead of just /groups.
  #
  # Yields the node (Nokogiri::XML::Reader) and the path (String) of the
  # current XML element. Paths always start with a slash, e.g.
  # "/export/users/user".
  #
  # Returns nothing.
  def each_node(attributes_in_path = false)
    File.open(@xml_file) do |io|
      # The leading empty segment makes every joined path start with "/".
      segments = ['']

      Nokogiri::XML::Reader(io).each do |node|
        # Start tag: push the element onto the path and hand it to the caller.
        if node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT
          segments << path_segment(node, attributes_in_path)
          yield node, segments.join('/')
        end

        # End tag, or an element such as <a/> that never gets an end tag.
        if node.node_type == Nokogiri::XML::Reader::TYPE_END_ELEMENT || node.self_closing?
          segments.pop
        end
      end
    end

    nil
  end

  private

  # Internal: Build the path segment for an element, optionally followed by
  # its attributes as "/@name=value" parts sorted by attribute name.
  def path_segment(node, attributes_in_path)
    return node.name unless attributes_in_path

    attributes = node.attributes
    return node.name if attributes.empty?

    attribute_parts = attributes.sort.map { |name, value| "@#{name}=#{value}" }
    "#{node.name}/#{attribute_parts.join('/')}"
  end
end
# Self-test, run only when this file is executed directly:
#
#   ruby <this file> INPUT_XML OUTPUT_FILE
#
# It copies every user, every public group and every message belonging to a
# public group from INPUT_XML to OUTPUT_FILE (one element per line) and
# checks that exactly 100 of each were found.
if __FILE__ == $PROGRAM_NAME
  # minitest/autorun runs the test class below at exit.
  require 'minitest/autorun'

  class TestBigXML < Minitest::Test
    def test_grep
      input_path, output_path = ARGV
      skip 'usage: ruby <this file> INPUT_XML OUTPUT_FILE' unless input_path && output_path

      xml = BigXML.new(input_path)
      users = 0
      groups = 0
      messages = 0

      File.open(output_path, 'w') do |out_file|
        xml.each_node do |node, path|
          case path
          when '/export/users/user'
            users += 1
            write_node(out_file, node)
          when '/export/groups/group'
            # Only keep groups whose <private> child is exactly "false".
            if private_flag_false?(node, '/group/private')
              groups += 1
              write_node(out_file, node)
            end
          when '/export/messages/message'
            # Only keep messages whose group's <private> child is exactly "false".
            if private_flag_false?(node, '/message/group/private')
              messages += 1
              write_node(out_file, node)
            end
          end
        end

        assert_equal 100, users
        assert_equal 100, groups
        assert_equal 100, messages
      end
    end

    private

    # Appends the element's full XML, followed by a newline, to out_file.
    def write_node(out_file, node)
      out_file << node.outer_xml
      out_file << "\n"
    end

    # Parses the current element on its own and reports whether the element
    # found at private_xpath exists and has the text "false".
    def private_flag_false?(node, private_xpath)
      flag = Nokogiri::XML.parse(node.outer_xml).at(private_xpath)
      !flag.nil? && flag.inner_text == 'false'
    end
  end
end